if (!require("nnet")) install.packages("nnet")
## Caricamento del pacchetto richiesto: nnet
if (!require("MASS")) install.packages("MASS")
## Caricamento del pacchetto richiesto: MASS
if (!require("e1071")) install.packages("e1071")
## Caricamento del pacchetto richiesto: e1071
if (!require("class")) install.packages("class")
## Caricamento del pacchetto richiesto: class
if (!require("leaps")) install.packages("leaps")
## Caricamento del pacchetto richiesto: leaps
if (!require("glmnet")) install.packages("glmnet")
## Caricamento del pacchetto richiesto: glmnet
## Caricamento del pacchetto richiesto: Matrix
## Loaded glmnet 4.1-8
if (!require("car")) install.packages("car")
## Caricamento del pacchetto richiesto: car
## Caricamento del pacchetto richiesto: carData
if (!require("caTools")) install.packages("caTools")
## Caricamento del pacchetto richiesto: caTools
if (!require("mgcv")) install.packages("mgcv")
## Caricamento del pacchetto richiesto: mgcv
## Caricamento del pacchetto richiesto: nlme
## This is mgcv 1.9-0. For overview type 'help("mgcv-package")'.
##
## Caricamento pacchetto: 'mgcv'
## Il seguente oggetto è mascherato da 'package:nnet':
##
## multinom
if (!require("summarytools")) install.packages("summarytools")
## Caricamento del pacchetto richiesto: summarytools
if (!require("dplyr")) install.packages("dplyr")
## Caricamento del pacchetto richiesto: dplyr
##
## Caricamento pacchetto: 'dplyr'
## Il seguente oggetto è mascherato da 'package:nlme':
##
## collapse
## Il seguente oggetto è mascherato da 'package:car':
##
## recode
## Il seguente oggetto è mascherato da 'package:MASS':
##
## select
## I seguenti oggetti sono mascherati da 'package:stats':
##
## filter, lag
## I seguenti oggetti sono mascherati da 'package:base':
##
## intersect, setdiff, setequal, union
if (!require("ggplot2")) install.packages("ggplot2")
## Caricamento del pacchetto richiesto: ggplot2
if (!require("tidyverse")) install.packages("tidyverse")
## Caricamento del pacchetto richiesto: tidyverse
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ forcats 1.0.0 ✔ stringr 1.5.1
## ✔ lubridate 1.9.3 ✔ tibble 3.2.1
## ✔ purrr 1.0.2 ✔ tidyr 1.3.0
## ✔ readr 2.1.5
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::collapse() masks nlme::collapse()
## ✖ tidyr::expand() masks Matrix::expand()
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag() masks stats::lag()
## ✖ tidyr::pack() masks Matrix::pack()
## ✖ dplyr::recode() masks car::recode()
## ✖ dplyr::select() masks MASS::select()
## ✖ purrr::some() masks car::some()
## ✖ tidyr::unpack() masks Matrix::unpack()
## ✖ tibble::view() masks summarytools::view()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
if (!require("lubridate")) install.packages("lubridate")
if (!require("mapview")) install.packages("mapview")
## Caricamento del pacchetto richiesto: mapview
if (!require("sf")) install.packages("sf")
## Caricamento del pacchetto richiesto: sf
## Linking to GEOS 3.11.2, GDAL 3.7.2, PROJ 9.3.0; sf_use_s2() is TRUE
if (!require("geojsonio")) install.packages("geojsonio")
## Caricamento del pacchetto richiesto: geojsonio
## Registered S3 method overwritten by 'geojsonsf':
## method from
## print.geojson geojson
##
## Caricamento pacchetto: 'geojsonio'
##
## Il seguente oggetto è mascherato da 'package:base':
##
## pretty
if (!require("leaflet")) install.packages("leaflet")
## Caricamento del pacchetto richiesto: leaflet
if (!require("broom")) install.packages("broom")
## Caricamento del pacchetto richiesto: broom
if (!require("plotly")) install.packages("plotly")
## Caricamento del pacchetto richiesto: plotly
##
## Caricamento pacchetto: 'plotly'
##
## Il seguente oggetto è mascherato da 'package:ggplot2':
##
## last_plot
##
## Il seguente oggetto è mascherato da 'package:MASS':
##
## select
##
## Il seguente oggetto è mascherato da 'package:stats':
##
## filter
##
## Il seguente oggetto è mascherato da 'package:graphics':
##
## layout
library(nnet)
library(MASS)
library(e1071)
library(class)
library(leaps)
library(glmnet)
library(car)
library(caTools)
library(mgcv)
library(summarytools)
library(dplyr)
library(ggplot2)
library(tidyverse)
library(lubridate)
library(mapview)
library(sf)
library(geojsonio)
library(leaflet)
library(broom)
library(plotly)
The Fire Incident Dispatch Data file contains data that is generated by the Starfire Computer Aided Dispatch System. The data spans from the time the incident is created in the system to the time the incident is closed in the system. It covers information about the incident as it relates to the assignment of resources and the Fire Department’s response to the emergency. To protect personal identifying information in accordance with the Health Insurance Portability and Accountability Act (HIPAA), specific locations of incidents are not included and have been aggregated to a higher level of detail.
In this analysis we have restricted the analysis only on the last 50.000 observations from 5th of September to 30th of the same month.
STARFIRE_INCIDENT_ID: An incident identifier comprising the 5 character julian date, 4 character alarm box number, 2 character number of incidents at the box so far for the day, 1 character borough code , 4 character sequence number.
INCIDENT_DATETIME: The date and time of the incident.
ALARM_BOX_BOROUGH: The borough of the alarm box.
ALARM_BOX_LOCATION: The location of the alarm box.
ALARM_BOX: The alarm box number.
INCIDENT_BOROUGH: The borough of the incident.
ZIPCODE: The zip code of the incident.
POLICEPRECINCT: The police precinct of the incident.
CITYCOUNCILDISTRICT: The city council district.
COMMUNITYDISTRICT: The community district.
COMMUNITYSCHOOLDISTRICT: The community school district.
CONGRESSIONALDISTRICT: The congressional district.
ALARM_SOURCE_DESCRIPTION_TX: The description of the alarm source.
ALARM_LEVEL_INDEX_DESCRIPTION: The alarm level index.
HIGHEST_ALARM_LEVEL: The highest alarm level.
INCIDENT_CLASSIFICATION: The incident classification.
INCIDENT_CLASSIFICATION_GROUP: The incident classification roll up group.
FIRST_ASSIGNMENT_DATETIME: The date and time of the first unit assignment.
FIRST_ACTIVATION_DATETIME: The date and time of the first unit acknowledgement of the assignment.
FIRST_ON_SCENE_DATETIME: The date and time of the first unit at the scene of the incident.
INCIDENT_CLOSE_DATETIME: The date and time that the incident was closed in the dispatch system.
VALID_DISPATCH_RSPNS_TIME_INDC: Indicates that the components comprising the generation of the DISPATCH_RESPONSE_SECONDS_QY are valid.
DISPATCH_RESPONSE_SECONDS_QY: The elapsed time in seconds between the incident_datetime and the first_assignment_datetime.
VALID_INCIDENT_RSPNS_TIME_INDC: Indicates that the components comprising the generation of the INCIDENT_RESPONSE_SECONDS_QY are valid.
INCIDENT_RESPONSE_SECONDS_QY: The elapsed time in seconds between the incident_datetime and the first_onscene_datetime.
INCIDENT_TRAVEL_TM_SECONDS_QY: The elapsed time in seconds between the first_assignment_datetime and the first_onscene_datetime.
ENGINES_ASSIGNED_QUANTITY: The number of engine units assigned to the incident.
LADDERS_ASSIGNED_QUANTITY: The number of ladder units assigned to the incident.
OTHER_UNITS_ASSIGNED_QUANTITY: The number of units that are not engines or ladders that were assigned to the incident.
Regarding the response we will create two different analysis one with the aim to predict the INCIDENT_RESPONSE_SECONDS_QY and the other to predict the TICKET_TIME_QY. In both models the remaining time difference predictors are removed.
The first step is always to read the dataset and plot the first 5 observations
fire_data <- read.csv("datasets/Fire_Incident_Dispatch_Data_last_50k.csv")
head(fire_data)
## STARFIRE_INCIDENT_ID INCIDENT_DATETIME ALARM_BOX_BOROUGH
## 1 230905-B1937-001-0567 09/05/2023 02:19:04 PM BROOKLYN
## 2 230905-B3923-002-0568 09/05/2023 02:19:36 PM BROOKLYN
## 3 230905-X8897-003-0480 09/05/2023 02:19:43 PM BRONX
## 4 230905-X3466-001-0481 09/05/2023 02:21:00 PM BRONX
## 5 230905-B2448-001-0570 09/05/2023 02:21:26 PM BROOKLYN
## 6 230905-B2448-002-0571 09/05/2023 02:22:35 PM BROOKLYN
## ALARM_BOX_NUMBER ALARM_BOX_LOCATION INCIDENT_BOROUGH
## 1 1937 AUTUMN AVE & FULTON ST BROOKLYN
## 2 3923 N/S EASTERN PWAY & UTICA AVE BROOKLYN
## 3 8897 CROSS BX EXPY- DEEGAN EX TO JEROME AV BRONX
## 4 3466 ADEE AVE & BX PARK E BRONX
## 5 2448 GLENWOOD RD & BEDFORD AVE BROOKLYN
## 6 2448 GLENWOOD RD & BEDFORD AVE BROOKLYN
## ZIPCODE POLICEPRECINCT CITYCOUNCILDISTRICT COMMUNITYDISTRICT
## 1 11208 75 37 305
## 2 11213 71 35 309
## 3 NA NA NA NA
## 4 10467 49 12 211
## 5 11210 70 45 314
## 6 11210 70 45 314
## COMMUNITYSCHOOLDISTRICT CONGRESSIONALDISTRICT ALARM_SOURCE_DESCRIPTION_TX
## 1 19 7 EMS
## 2 17 9 CLASS-3
## 3 NA NA EMS-911
## 4 11 15 EMS
## 5 22 9 EMS
## 6 22 9 EMS
## ALARM_LEVEL_INDEX_DESCRIPTION HIGHEST_ALARM_LEVEL
## 1 Initial Alarm First Alarm
## 2 Initial Alarm First Alarm
## 3 Initial Alarm First Alarm
## 4 DEFAULT RECORD First Alarm
## 5 DEFAULT RECORD First Alarm
## 6 DEFAULT RECORD First Alarm
## INCIDENT_CLASSIFICATION INCIDENT_CLASSIFICATION_GROUP
## 1 Medical - No PT Contact EMS is Onscene Medical Emergencies
## 2 Hospital Fire Structural Fires
## 3 Vehicle Accident - Other NonMedical Emergencies
## 4 Medical - EMS Link 10-91 Medical Emergencies
## 5 Medical - EMS Link 10-91 Medical Emergencies
## 6 Medical - EMS Link 10-91 Medical Emergencies
## DISPATCH_RESPONSE_SECONDS_QY FIRST_ASSIGNMENT_DATETIME
## 1 7 09/05/2023 02:19:12 PM
## 2 95 09/05/2023 02:21:11 PM
## 3 41 09/05/2023 02:20:25 PM
## 4 298 09/05/2023 02:25:59 PM
## 5 25 09/05/2023 02:21:52 PM
## 6 350 09/05/2023 02:28:25 PM
## FIRST_ACTIVATION_DATETIME FIRST_ON_SCENE_DATETIME INCIDENT_CLOSE_DATETIME
## 1 09/05/2023 02:19:26 PM 09/05/2023 02:25:23 PM 09/05/2023 03:03:15 PM
## 2 09/05/2023 02:21:33 PM 09/05/2023 02:23:21 PM 09/05/2023 02:34:18 PM
## 3 09/05/2023 02:20:35 PM 09/05/2023 02:26:22 PM 09/05/2023 04:13:32 PM
## 4 09/05/2023 02:26:04 PM 09/05/2023 02:34:23 PM
## 5 09/05/2023 02:22:08 PM 09/05/2023 02:28:07 PM
## 6 09/05/2023 02:29:09 PM
## VALID_DISPATCH_RSPNS_TIME_INDC VALID_INCIDENT_RSPNS_TIME_INDC
## 1 N Y
## 2 N Y
## 3 N Y
## 4 N N
## 5 N N
## 6 N N
## INCIDENT_RESPONSE_SECONDS_QY INCIDENT_TRAVEL_TM_SECONDS_QY
## 1 378 371
## 2 224 129
## 3 398 357
## 4 NA NA
## 5 NA NA
## 6 NA NA
## ENGINES_ASSIGNED_QUANTITY LADDERS_ASSIGNED_QUANTITY
## 1 1 0
## 2 3 2
## 3 2 3
## 4 1 0
## 5 1 0
## 6 1 0
## OTHER_UNITS_ASSIGNED_QUANTITY
## 1 0
## 2 1
## 3 1
## 4 0
## 5 0
## 6 0
Use dfSummary from summarytool in order to have a complete and clear sumamry of the dataset.
print(dfSummary(fire_data,
plain.ascii = FALSE,
style = "multiline",
headings = FALSE,
graph.magnif = 0.8,
valid.col = FALSE),
method = 'render')
| No | Variable | Stats / Values | Freqs (% of Valid) | Graph | Missing | |||||||||||||||||||||||||||||||||||||||||||||||||||||||
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 1 | STARFIRE_INCIDENT_ID [character] |
|
|
0 (0.0%) | ||||||||||||||||||||||||||||||||||||||||||||||||||||||||
| 2 | INCIDENT_DATETIME [character] |
|
|
0 (0.0%) | ||||||||||||||||||||||||||||||||||||||||||||||||||||||||
| 3 | ALARM_BOX_BOROUGH [character] |
|
|
0 (0.0%) | ||||||||||||||||||||||||||||||||||||||||||||||||||||||||
| 4 | ALARM_BOX_NUMBER [integer] |
|
7411 distinct values | 0 (0.0%) | ||||||||||||||||||||||||||||||||||||||||||||||||||||||||
| 5 | ALARM_BOX_LOCATION [character] |
|
|
0 (0.0%) | ||||||||||||||||||||||||||||||||||||||||||||||||||||||||
| 6 | INCIDENT_BOROUGH [character] |
|
|
0 (0.0%) | ||||||||||||||||||||||||||||||||||||||||||||||||||||||||
| 7 | ZIPCODE [integer] |
|
217 distinct values | 3181 (6.4%) | ||||||||||||||||||||||||||||||||||||||||||||||||||||||||
| 8 | POLICEPRECINCT [integer] |
|
77 distinct values | 3180 (6.4%) | ||||||||||||||||||||||||||||||||||||||||||||||||||||||||
| 9 | CITYCOUNCILDISTRICT [integer] |
|
51 distinct values | 3180 (6.4%) | ||||||||||||||||||||||||||||||||||||||||||||||||||||||||
| 10 | COMMUNITYDISTRICT [integer] |
|
70 distinct values | 3180 (6.4%) | ||||||||||||||||||||||||||||||||||||||||||||||||||||||||
| 11 | COMMUNITYSCHOOLDISTRICT [integer] |
|
32 distinct values | 3182 (6.4%) | ||||||||||||||||||||||||||||||||||||||||||||||||||||||||
| 12 | CONGRESSIONALDISTRICT [integer] |
|
13 distinct values | 3180 (6.4%) | ||||||||||||||||||||||||||||||||||||||||||||||||||||||||
| 13 | ALARM_SOURCE_DESCRIPTION_TX [character] |
|
|
0 (0.0%) | ||||||||||||||||||||||||||||||||||||||||||||||||||||||||
| 14 | ALARM_LEVEL_INDEX_DESCRIPTION [character] |
|
|
0 (0.0%) | ||||||||||||||||||||||||||||||||||||||||||||||||||||||||
| 15 | HIGHEST_ALARM_LEVEL [character] |
|
|
0 (0.0%) | ||||||||||||||||||||||||||||||||||||||||||||||||||||||||
| 16 | INCIDENT_CLASSIFICATION [character] |
|
|
0 (0.0%) | ||||||||||||||||||||||||||||||||||||||||||||||||||||||||
| 17 | INCIDENT_CLASSIFICATION_GROUP [character] |
|
|
0 (0.0%) | ||||||||||||||||||||||||||||||||||||||||||||||||||||||||
| 18 | DISPATCH_RESPONSE_SECONDS_QY [integer] |
|
841 distinct values | 0 (0.0%) | ||||||||||||||||||||||||||||||||||||||||||||||||||||||||
| 19 | FIRST_ASSIGNMENT_DATETIME [character] |
|
|
0 (0.0%) | ||||||||||||||||||||||||||||||||||||||||||||||||||||||||
| 20 | FIRST_ACTIVATION_DATETIME [character] |
|
|
0 (0.0%) | ||||||||||||||||||||||||||||||||||||||||||||||||||||||||
| 21 | FIRST_ON_SCENE_DATETIME [character] |
|
|
0 (0.0%) | ||||||||||||||||||||||||||||||||||||||||||||||||||||||||
| 22 | INCIDENT_CLOSE_DATETIME [character] |
|
|
0 (0.0%) | ||||||||||||||||||||||||||||||||||||||||||||||||||||||||
| 23 | VALID_DISPATCH_RSPNS_TIME_INDC [character] | 1. N |
|
0 (0.0%) | ||||||||||||||||||||||||||||||||||||||||||||||||||||||||
| 24 | VALID_INCIDENT_RSPNS_TIME_INDC [character] |
|
|
0 (0.0%) | ||||||||||||||||||||||||||||||||||||||||||||||||||||||||
| 25 | INCIDENT_RESPONSE_SECONDS_QY [integer] |
|
1496 distinct values | 14112 (28.2%) | ||||||||||||||||||||||||||||||||||||||||||||||||||||||||
| 26 | INCIDENT_TRAVEL_TM_SECONDS_QY [integer] |
|
1382 distinct values | 14112 (28.2%) | ||||||||||||||||||||||||||||||||||||||||||||||||||||||||
| 27 | ENGINES_ASSIGNED_QUANTITY [integer] |
|
15 distinct values | 62 (0.1%) | ||||||||||||||||||||||||||||||||||||||||||||||||||||||||
| 28 | LADDERS_ASSIGNED_QUANTITY [integer] |
|
12 distinct values | 62 (0.1%) | ||||||||||||||||||||||||||||||||||||||||||||||||||||||||
| 29 | OTHER_UNITS_ASSIGNED_QUANTITY [integer] |
|
23 distinct values | 62 (0.1%) |
Generated by summarytools 1.0.1 (R version 4.3.2)
2024-01-13
Now we rename all the columns in order to be smaller whenever we plot graphs.
fire_data <- fire_data %>%
rename(id = STARFIRE_INCIDENT_ID, datetime = INCIDENT_DATETIME, al_borough = ALARM_BOX_BOROUGH,
al_number = ALARM_BOX_NUMBER,al_location = ALARM_BOX_LOCATION, inc_borough = INCIDENT_BOROUGH,
zipcode = ZIPCODE, pol_prec = POLICEPRECINCT, city_con_dist = CITYCOUNCILDISTRICT,
commu_dist = COMMUNITYDISTRICT, commu_sc_dist = COMMUNITYSCHOOLDISTRICT,
cong_dist = CONGRESSIONALDISTRICT, al_source_desc = ALARM_SOURCE_DESCRIPTION_TX,
al_index_desc = ALARM_LEVEL_INDEX_DESCRIPTION, highest_al_level = HIGHEST_ALARM_LEVEL,
inc_class = INCIDENT_CLASSIFICATION, inc_class_group = INCIDENT_CLASSIFICATION_GROUP,
first_ass_datetime = FIRST_ASSIGNMENT_DATETIME, first_act_datetime = FIRST_ACTIVATION_DATETIME,
first_onscene_datetime = FIRST_ON_SCENE_DATETIME, inc_close_datetime = INCIDENT_CLOSE_DATETIME,
disp_resp_sec_qy = DISPATCH_RESPONSE_SECONDS_QY, disp_resp_sec_indc = VALID_DISPATCH_RSPNS_TIME_INDC,
inc_resp_sec_qy = INCIDENT_RESPONSE_SECONDS_QY, inc_resp_sec_indc = VALID_INCIDENT_RSPNS_TIME_INDC,
inc_travel_sec_qy = INCIDENT_TRAVEL_TM_SECONDS_QY,
engines_assigned = ENGINES_ASSIGNED_QUANTITY,
ladders_assigned = LADDERS_ASSIGNED_QUANTITY, others_units_assigned = OTHER_UNITS_ASSIGNED_QUANTITY)
As we can see from the summary there are many NA values, and many predictors that are as characters and not factors. In this step we will convert the characters predictors as factors merging the values that appear less in the dataset, so we do no have many values that have low frequency in our dataset.
In addition we will add he predictor for the day_number, a factorial predictor to indicate in the incident day is a week day or not dat_type and a factorial predictor time_of_day that indicates the range of time whenever the incident happens, so Night (if the hour is between 0 and 6), Morning (if the hour is between 6 and 12), Afternoon (if the hour is between 12 and 18), Evening (if the hour is between 18 and 24).
Since we are dealing with datetime we also check if the differences (inc_resp_sec_qy, inc_travel_sec_qy and disp_resp_sec_qy) are actually corrects, if not we replace them with the correct one.
Finally we decided to add an additional time difference the emergency_min_qy which represents the difference between the inc_close_datetime and the first_onscene_datetime.
# set factorial
fire_data$inc_borough <- as.factor(fire_data$inc_borough)
fire_data$al_borough <- as.factor(fire_data$al_borough)
fire_data$al_source_desc <- as.factor(fire_data$al_source_desc)
fire_data$al_index_desc <- as.factor(fire_data$al_index_desc)
fire_data$highest_al_level <- as.factor(fire_data$highest_al_level)
fire_data$disp_resp_sec_indc <- as.factor(fire_data$disp_resp_sec_indc)
levels(fire_data$disp_resp_sec_indc)<- c("N", "Y")
fire_data$inc_resp_sec_indc <- as.factor(fire_data$inc_resp_sec_indc)
levels(fire_data$inc_resp_sec_indc)<- c("N", "Y")
fire_data$inc_class_group <- as.factor(fire_data$inc_class_group)
fire_data$inc_class <- as.factor(fire_data$inc_class)
Moreover we note that the maximum level of the time differences is very high to be considered as seconds so we decided to scale the two indicators in minutes.
summary(fire_data %>% select(inc_resp_sec_qy, inc_travel_sec_qy, disp_resp_sec_qy))
## inc_resp_sec_qy inc_travel_sec_qy disp_resp_sec_qy
## Min. : 18.0 Min. : 0.0 Min. : 2.00
## 1st Qu.: 265.0 1st Qu.: 233.0 1st Qu.: 7.00
## Median : 334.0 Median : 301.0 Median : 19.00
## Mean : 380.7 Mean : 340.5 Mean : 39.96
## 3rd Qu.: 426.0 3rd Qu.: 392.0 3rd Qu.: 40.00
## Max. :7130.0 Max. :7122.0 Max. :9023.00
## NA's :14112 NA's :14112
# scaling
fire_data$inc_resp_sec_qy <- fire_data$inc_resp_sec_qy / 60
fire_data$inc_travel_sec_qy <- fire_data$inc_travel_sec_qy / 60
fire_data$disp_resp_sec_qy <- fire_data$disp_resp_sec_qy / 60
# renaming both quantity and indicator predictors for the two datetime
fire_data <- fire_data %>% rename(inc_resp_min_qy = inc_resp_sec_qy, inc_travel_min_qy = inc_travel_sec_qy, disp_resp_min_qy = disp_resp_sec_qy, # quantity
inc_resp_min_indc = inc_resp_sec_indc, disp_resp_min_indc = disp_resp_sec_indc) # indicator
Here we create the time_of_day and is_weekend
# Process datetime column
fire_data$datetime <- mdy_hms(fire_data$datetime)
fire_data$first_ass_datetime <- mdy_hms(fire_data$first_ass_datetime)
fire_data$first_act_datetime <- mdy_hms(fire_data$first_act_datetime)
fire_data$first_onscene_datetime <- mdy_hms(fire_data$first_onscene_datetime)
fire_data$inc_close_datetime <- mdy_hms(fire_data$inc_close_datetime)
# checking if the differences are well computed if not change with the correct one
if (!identical(fire_data$inc_resp_min_qy, as.numeric(difftime(fire_data$first_onscene_datetime, fire_data$datetime, units="mins")))){
fire_data$inc_resp_min_qy <- as.numeric(difftime(fire_data$first_onscene_datetime, fire_data$datetime, units="mins"))
}
if (!identical(fire_data$inc_travel_min_qy, as.numeric(difftime(fire_data$first_onscene_datetime, fire_data$first_ass_datetime, units="mins")))){
fire_data$inc_travel_min_qy <- as.numeric(difftime(fire_data$first_onscene_datetime, fire_data$first_ass_datetime, units="mins"))
}
if (!identical(fire_data$disp_resp_min_qy, as.numeric(difftime(fire_data$first_ass_datetime, fire_data$datetime, units="mins")))){
fire_data$disp_resp_min_qy <- as.numeric(difftime(fire_data$first_ass_datetime, fire_data$datetime, units="mins"))
}
# creating emergency_min_qy which describe the time taken by the firefighter to close the emergency after have been arrived to the location
fire_data$emergency_min_qy <- as.numeric(difftime(fire_data$inc_close_datetime, fire_data$first_onscene_datetime, units="mins"))
# creating day_type
fire_data$day_type <- as.factor(ifelse(weekdays(fire_data$datetime) %in% c("sabato", "domenica"), "Weekend", "Weekday"))
# creating ticket_time
fire_data$ticket_time <- as.numeric(difftime(fire_data$inc_close_datetime, fire_data$datetime, units="mins"))
# creating time_of_day
fire_data$time_of_day <- cut(
hour(fire_data$datetime),
breaks = c(0, 6, 12, 18, 24),
labels = c("Night", "Morning", "Afternoon", "Evening"),
include.lowest = TRUE,
right = TRUE
)
fire_data$datetime <- NULL
table(fire_data$time_of_day)
##
## Night Morning Afternoon Evening
## 8521 13270 16499 11710
ggplot(data=fire_data %>%
group_by(time_of_day) %>%
summarise(incident_number = n()),
aes(x=time_of_day, y=incident_number)) +
geom_bar(stat="identity", position=position_dodge()) +
geom_text(aes(label=incident_number), vjust=1.6, color="white", position = position_dodge(0.9), size=3.5) +
labs(title = "Time of the Day - Incident Count", x = "Time of the Day", y = "Incident Count") +
theme_grey()
From this we can see that the higher number of fire incident is registered from 12 PM to 18 PM, whereas the lower number of fire incident happened from the 00 AM to 06 AM.
day_type_table <- table(fire_data$day_type)
day_type_table[1] <- day_type_table[1] / 5
day_type_table[2] <- day_type_table[2] / 2
day_type_table
##
## Weekday Weekend
## 7296.4 6759.0
And in proportion we can see that on average there is an higher number of fire incident on the week day respect to the week end days.
Now regarding the assigned untis we decided to add a summary predictor that include the sum of all the three assigned units predictors.
fire_data$total_assigned_unit <- fire_data$engines_assigned + fire_data$ladders_assigned + fire_data$others_units_assigned
Rename the factor levels for the inc_borough and predictors
fire_data <- fire_data %>% mutate(inc_borough = recode_factor(
inc_borough, "BRONX" = "Bronx", "BROOKLYN" = "Brooklyn", "MANHATTAN" = "Manhattan",
"QUEENS" = "Queens", "RICHMOND / STATEN ISLAND" = "Staten Island"),
al_borough = recode_factor(
al_borough, "BRONX" = "Bronx", "BROOKLYN" = "Brooklyn", "MANHATTAN" = "Manhattan",
"QUEENS" = "Queens", "RICHMOND / STATEN ISLAND" = "Staten Island"))
At this point we merge some possible value from factorial predictors to make the space of possible choice smaller.
Here we merge the following factorial values of highest_al_level: Second Alarm and Third Alarm into 2nd-3rd Alarm.
# highest_al_level
fire_data$highest_alarm_lev_new <- fire_data$highest_al_level
levels(fire_data$highest_alarm_lev_new) <- list(
"All Hands Working" = "All Hands Working",
"First Alarm" = "First Alarm",
"2nd-3rd Alarm" = c("Second Alarm", "Third Alarm")
)
print(ctable(fire_data$highest_al_level, fire_data$highest_alarm_lev_new, prop = 'n', totals = FALSE, headings = FALSE), method = 'render')
| highest_alarm_lev_new | |||
|---|---|---|---|
| highest_al_level | All Hands Working |
First Alarm | 2nd-3rd Alarm |
| All Hands Working | 100 | 0 | 0 |
| First Alarm | 0 | 49891 | 0 |
| Second Alarm | 0 | 0 | 8 |
| Third Alarm | 0 | 0 | 1 |
Generated by summarytools 1.0.1 (R version 4.3.2)
2024-01-13
fire_data$highest_al_level <- fire_data$highest_alarm_lev_new
fire_data$highest_alarm_lev_new <- NULL
Here we merge the following factorial values of al_index_desc: Second Alarm, Third Alarm, 7-5 (All Hands Alarm), 10-76 & 10-77 Signal (Notification Hi-Rise Fire) and 10-75 Signal (Request for all hands alarm) into Others.
# al_index_desc
fire_data$alarm_level_idx_new <- fire_data$al_index_desc
levels(fire_data$alarm_level_idx_new) <- list(
"DEFAULT RECORD" = "DEFAULT RECORD",
"Initial Alarm" = "Initial Alarm",
"Others" = c("Second Alarm", "Third Alarm", "7-5 (All Hands Alarm)",
"10-76 & 10-77 Signal (Notification Hi-Rise Fire)",
"10-75 Signal (Request for all hands alarm)")
)
print(ctable(fire_data$al_index_desc, fire_data$alarm_level_idx_new, prop = 'n', totals = FALSE, headings = FALSE), method = 'render')
| alarm_level_idx_new | |||
|---|---|---|---|
| al_index_desc | DEFAULT RECORD |
Initial Alarm |
Others |
| 10-75 Signal (Request for all hands alarm) | 0 | 0 | 13 |
| 10-76 & 10-77 Signal (Notification Hi-Rise Fire) | 0 | 0 | 3 |
| 7-5 (All Hands Alarm) | 0 | 0 | 100 |
| DEFAULT RECORD | 17313 | 0 | 0 |
| Initial Alarm | 0 | 32562 | 0 |
| Second Alarm | 0 | 0 | 8 |
| Third Alarm | 0 | 0 | 1 |
Generated by summarytools 1.0.1 (R version 4.3.2)
2024-01-13
fire_data$al_index_desc <- fire_data$alarm_level_idx_new
fire_data$alarm_level_idx_new <- NULL
Here we merge the following factorial values of al_source_desc: 911, 911TEXT, VERBAL, BARS, ERS, ERS-NC and SOL into Others.
fire_data$alarm_source_desc_new <- fire_data$al_source_desc
levels(fire_data$alarm_source_desc_new) <- list(
"PHONE" = "PHONE",
"EMS" = "EMS",
"EMS-911" = "EMS-911",
"CLASS-3" = "CLASS-3",
"Others" = c("911", "911TEXT", "VERBAL", "BARS", "ERS", "ERS-NC", "SOL")
)
print(ctable(fire_data$al_source_desc, fire_data$alarm_source_desc_new, prop = 'n', totals = FALSE, headings = FALSE), method = 'render')
| alarm_source_desc_new | |||||
|---|---|---|---|---|---|
| al_source_desc | PHONE | EMS | EMS-911 | CLASS-3 | Others |
| 911 | 0 | 0 | 0 | 0 | 302 |
| 911TEXT | 0 | 0 | 0 | 0 | 14 |
| BARS | 0 | 0 | 0 | 0 | 1 |
| CLASS-3 | 0 | 0 | 0 | 5025 | 0 |
| EMS | 0 | 17178 | 0 | 0 | 0 |
| EMS-911 | 0 | 0 | 10520 | 0 | 0 |
| ERS | 0 | 0 | 0 | 0 | 777 |
| ERS-NC | 0 | 0 | 0 | 0 | 1 |
| PHONE | 15146 | 0 | 0 | 0 | 0 |
| SOL | 0 | 0 | 0 | 0 | 5 |
| VERBAL | 0 | 0 | 0 | 0 | 1031 |
Generated by summarytools 1.0.1 (R version 4.3.2)
2024-01-13
fire_data$al_source_desc <- fire_data$alarm_source_desc_new
fire_data$alarm_source_desc_new <- NULL
View again the dataset summary to see the applied changes.
print(dfSummary(fire_data,
plain.ascii = FALSE,
style = "multiline",
headings = FALSE,
graph.magnif = 0.8,
valid.col = FALSE),
method = 'render')
| No | Variable | Stats / Values | Freqs (% of Valid) | Graph | Missing | |||||||||||||||||||||||||||||||||||||||||||||||||||||||
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 1 | id [character] |
|
|
0 (0.0%) | ||||||||||||||||||||||||||||||||||||||||||||||||||||||||
| 2 | al_borough [factor] |
|
|
0 (0.0%) | ||||||||||||||||||||||||||||||||||||||||||||||||||||||||
| 3 | al_number [integer] |
|
7411 distinct values | 0 (0.0%) | ||||||||||||||||||||||||||||||||||||||||||||||||||||||||
| 4 | al_location [character] |
|
|
0 (0.0%) | ||||||||||||||||||||||||||||||||||||||||||||||||||||||||
| 5 | inc_borough [factor] |
|
|
0 (0.0%) | ||||||||||||||||||||||||||||||||||||||||||||||||||||||||
| 6 | zipcode [integer] |
|
217 distinct values | 3181 (6.4%) | ||||||||||||||||||||||||||||||||||||||||||||||||||||||||
| 7 | pol_prec [integer] |
|
77 distinct values | 3180 (6.4%) | ||||||||||||||||||||||||||||||||||||||||||||||||||||||||
| 8 | city_con_dist [integer] |
|
51 distinct values | 3180 (6.4%) | ||||||||||||||||||||||||||||||||||||||||||||||||||||||||
| 9 | commu_dist [integer] |
|
70 distinct values | 3180 (6.4%) | ||||||||||||||||||||||||||||||||||||||||||||||||||||||||
| 10 | commu_sc_dist [integer] |
|
32 distinct values | 3182 (6.4%) | ||||||||||||||||||||||||||||||||||||||||||||||||||||||||
| 11 | cong_dist [integer] |
|
13 distinct values | 3180 (6.4%) | ||||||||||||||||||||||||||||||||||||||||||||||||||||||||
| 12 | al_source_desc [factor] |
|
|
0 (0.0%) | ||||||||||||||||||||||||||||||||||||||||||||||||||||||||
| 13 | al_index_desc [factor] |
|
|
0 (0.0%) | ||||||||||||||||||||||||||||||||||||||||||||||||||||||||
| 14 | highest_al_level [factor] |
|
|
0 (0.0%) | ||||||||||||||||||||||||||||||||||||||||||||||||||||||||
| 15 | inc_class [factor] |
|
|
0 (0.0%) | ||||||||||||||||||||||||||||||||||||||||||||||||||||||||
| 16 | inc_class_group [factor] |
|
|
0 (0.0%) | ||||||||||||||||||||||||||||||||||||||||||||||||||||||||
| 17 | disp_resp_min_qy [numeric] |
|
850 distinct values | 0 (0.0%) | ||||||||||||||||||||||||||||||||||||||||||||||||||||||||
| 18 | first_ass_datetime [POSIXct, POSIXt] |
|
49509 distinct values | 0 (0.0%) | ||||||||||||||||||||||||||||||||||||||||||||||||||||||||
| 19 | first_act_datetime [POSIXct, POSIXt] |
|
49205 distinct values | 139 (0.3%) | ||||||||||||||||||||||||||||||||||||||||||||||||||||||||
| 20 | first_onscene_datetime [POSIXct, POSIXt] |
|
35552 distinct values | 14112 (28.2%) | ||||||||||||||||||||||||||||||||||||||||||||||||||||||||
| 21 | inc_close_datetime [POSIXct, POSIXt] |
|
49409 distinct values | 0 (0.0%) | ||||||||||||||||||||||||||||||||||||||||||||||||||||||||
| 22 | disp_resp_min_indc [factor] |
|
|
0 (0.0%) | ||||||||||||||||||||||||||||||||||||||||||||||||||||||||
| 23 | inc_resp_min_indc [factor] |
|
|
0 (0.0%) | ||||||||||||||||||||||||||||||||||||||||||||||||||||||||
| 24 | inc_resp_min_qy [numeric] |
|
1497 distinct values | 14112 (28.2%) | ||||||||||||||||||||||||||||||||||||||||||||||||||||||||
| 25 | inc_travel_min_qy [numeric] |
|
1369 distinct values | 14112 (28.2%) | ||||||||||||||||||||||||||||||||||||||||||||||||||||||||
| 26 | engines_assigned [integer] |
|
15 distinct values | 62 (0.1%) | ||||||||||||||||||||||||||||||||||||||||||||||||||||||||
| 27 | ladders_assigned [integer] |
|
12 distinct values | 62 (0.1%) | ||||||||||||||||||||||||||||||||||||||||||||||||||||||||
| 28 | others_units_assigned [integer] |
|
23 distinct values | 62 (0.1%) | ||||||||||||||||||||||||||||||||||||||||||||||||||||||||
| 29 | emergency_min_qy [numeric] |
|
4429 distinct values | 14112 (28.2%) | ||||||||||||||||||||||||||||||||||||||||||||||||||||||||
| 30 | day_type [factor] |
|
|
0 (0.0%) | ||||||||||||||||||||||||||||||||||||||||||||||||||||||||
| 31 | ticket_time [numeric] |
|
4870 distinct values | 0 (0.0%) | ||||||||||||||||||||||||||||||||||||||||||||||||||||||||
| 32 | time_of_day [factor] |
|
|
0 (0.0%) | ||||||||||||||||||||||||||||||||||||||||||||||||||||||||
| 33 | total_assigned_unit [integer] |
|
35 distinct values | 62 (0.1%) |
Generated by summarytools 1.0.1 (R version 4.3.2)
2024-01-13
The next step is to deal invalid values and delete some un-useful predictors.
First of all we saw the possibility that al_borough and inc_borough represent the same column, let’s chek it.
identical(fire_data$al_borough, fire_data$inc_borough)
## [1] TRUE
The column `al_borough and inc_borough have the same sequence of values, so we can delete one of the two.
fire_data <- fire_data %>% select(-c(al_borough))
Then we say that all observation in the dataset have the disp_resp_min_indc equal to N, let’s check again and in affermative case then we can delete both columns.
summary(fire_data$disp_resp_min_indc)
## N Y
## 50000 0
All our observations have non valid disp_resp_min_indc so we could delete both the column indicator and the respective column quantity disp_resp_min_qy. However we note that also in the original dataset all the observation have the disp_resp_min_indc set to N, which is quite strange, and seems that is problem relative to the data acquisition, thus we decide to mantein this time difference.
fire_data <- fire_data %>% select(-c(disp_resp_min_indc))
Now we do a quick check also on the other indicator variable inc_resp_min_indc
summary(fire_data$inc_resp_min_indc)
## N Y
## 17036 32964
But here we have some observations with valid inc_resp_min_indc, and we will consider only the valid one deleting the one that has a non valid attribute.
However before doing that let’s see the distribution of inc_resp_min_qy around the borough.
ggplot(data=fire_data %>% group_by(inc_borough, inc_resp_min_indc) %>% summarise(incident_number = n()),
aes(x=inc_borough, y=incident_number, fill=inc_resp_min_indc)) +
geom_bar(stat="identity", position=position_dodge()) +
geom_text(aes(label=incident_number), vjust=1.6, color="white",
position = position_dodge(0.9), size=3.5) +
scale_fill_brewer(palette="Paired") +
labs(title = "Incident Count - Borouh - Valid Response Time in Minutes", x = "Borough", y = "Incident Number", fill = "Valid Response\n Time in Minutes") +
theme_gray()
## `summarise()` has grouped output by 'inc_borough'. You can override using the
## `.groups` argument.
We can see that the number of fire incident is higher for the valid response time in minutes but, it is much interesting observe the rateo between the valid and the non valid.
And to the rateo of valid inc_resp_min_indc in each borough is:
rateo_inc_resp_min_indc <- fire_data %>%
group_by(inc_borough, inc_resp_min_indc) %>%
summarise(incident_number = n()) %>%
mutate(ratio=incident_number/sum(incident_number))
## `summarise()` has grouped output by 'inc_borough'. You can override using the
## `.groups` argument.
ggplot(rateo_inc_resp_min_indc, aes(fill=inc_resp_min_indc, y=ratio, x=inc_borough)) +
geom_bar(position="fill", stat="identity") +
geom_text(aes(label=scales::percent(ratio)), position=position_fill(vjust=0.5)) +
labs(title="Borough - Rateo Incident between Valid and Invalid",
x="Borough",
y="Rateo Incident between Valid and Invalid",
fill="Valid Response\nTime in Minutes")
And we can see that Staten Island has the higher number of incidents with valid inc_resp_min_indc , whereas Manhattan has the lower number, but remember that the former has the lowest number of fire incident and the latter has the higher number of incident.
Now we do an additional analysis to see if there is some find of relation between the inc_resp_min_indc and total_assigned_unit.
ggplot(fire_data, aes(total_assigned_unit, inc_resp_min_qy)) +
geom_point(aes(colour = inc_resp_min_indc))+
labs(title = "Total Assigned Units - Response Time In Minutes", x = "Total Assigned Units", y = "Response Time In Minutes", colour = "Valid Response\n Time in Minutes") +
theme_gray()
## Warning: Removed 14116 rows containing missing values (`geom_point()`).
We note that the majority of fire incident that had been assigned a single units has a high response time and the relative measure is not valid. Whereas for an higher number of total units the response time decrease and becomes valids.
ggplot(fire_data %>% filter(inc_resp_min_indc == "N")
, aes(total_assigned_unit, inc_resp_min_qy)) +
geom_point(aes(colour = inc_class_group)) +
labs(title = "Total Assigned Units - Response Time In Minutes - Incidnet Class Group", x = "Total Assigned Units", y = "Response Time In Minutes", colour = "Incident Class Groups") +
theme_gray()
## Warning: Removed 14112 rows containing missing values (`geom_point()`).
Regarding the incident class group around all the incidents with invalid response time had been assigned a single units as we discussed before, but in addition we found that are from the Medical Emergencies, whereas almost all the other incidents are from the NonMedical Emergencies.
# add an additional predictor
fire_data$tua_is_one <- as.factor(ifelse(fire_data$total_assigned_unit == 1, "Y", "N"))
tua_is_one <- fire_data %>%
filter(inc_resp_min_indc == "N", inc_class_group == "Medical Emergencies") %>%
group_by(inc_borough, tua_is_one) %>%
summarise(incident_number = n())
## `summarise()` has grouped output by 'inc_borough'. You can override using the
## `.groups` argument.
ggplot(data=tua_is_one,
aes(x=inc_borough, y=incident_number, fill=tua_is_one)) +
geom_bar(stat="identity", position=position_dodge()) +
geom_text(aes(label=incident_number), vjust=1.5, color="black",
position = position_dodge(0.9), size=3.5) +
scale_fill_brewer(palette="Set1") +
labs(title = "Total Assigned Units One or Not", x = "Borough", y = "Incident Count", fill = "Total Assigned\nUnits are One") +
theme_gray()
We have also added an additional factorial predictor tua_is_one to indicates if the total assigned units is equal to one or not.
Continuing we decide to analyse the type of Incident Class of the invalid incidents response time that had been assigned a single total units.
ggplot(data=fire_data %>%
filter(inc_resp_min_indc == "N", inc_class_group == "Medical Emergencies", tua_is_one == "Y") %>%
group_by(inc_class, inc_borough) %>%
summarise(incident_number = n()),
aes(x=inc_borough, y=incident_number, fill=inc_class)) +
geom_bar(stat="identity", position=position_dodge()) +
geom_text(aes(label=incident_number), vjust=1.6, color="black",
position = position_dodge(0.9), size=3) +
scale_fill_brewer(palette="Set1") +
labs(title = "Borough - Incident Counts - Incident Class -- for Total Assigned Units equal to 1", x = "Borough", y = "Incident Counts", fill = "Incident Class Group") +
theme_grey()
## `summarise()` has grouped output by 'inc_class'. You can override using the
## `.groups` argument.
And we found that the majority of the incident that respect these circumstances are mostly identified as Medical - EMS Link 10-91 and Medical - PD Link 10-91.
Thanks to the 10code site we found a description of the two emergency codes:
10-91 Medical Emergency EMS - Fire Unit Not Required - To be transmitted through borough dispatcher by the responding unit when the fire Unit is canceled enroute due to EMS on scene, or EMS downgrades the job to a segment that does not require a Fire Unit response. Note: This signal shall be used only for medical emergency incidents. EMS we are sure that stands for Emergency Medical Services.
10-91 Medical Emergency PD - Fire Unit Not Required - To be transmitted through borough dispatcher by the responding unit when the fire Unit is canceled enroute due to PD on scene, or PD downgrades the job to a segment that does not require a Fire Unit response. Note: This signal shall be used only for medical emergency incidents. PD we think that stands for Police Department.
Now we can look for the NonMedical Emergencies by first see the distribution of its incident class.
print(fire_data %>%
filter(inc_resp_min_indc == "N", inc_class_group == "NonMedical Emergencies") %>%
group_by(inc_class) %>%
summarise(incident_number = n()))
## # A tibble: 24 × 2
## inc_class incident_number
## <fct> <int>
## 1 Alarm System - Defective 10
## 2 Alarm System - Testing 22
## 3 Alarm System - Unnecessary 110
## 4 Assist Civilian - Non-Medical 828
## 5 Carbon Monoxide - Code 1 - Investigation 25
## 6 Carbon Monoxide - Code 2 - Incident (1-9 ppm) 4
## 7 Carbon Monoxide - Code 3 - Emergency (over 9 ppm) 4
## 8 Defective Oil Burner 5
## 9 Downed Tree 28
## 10 Elevator Emergency - Occupied 104
## # ℹ 14 more rows
ggplot(data=fire_data %>%
filter(inc_resp_min_indc == "N", inc_class_group == "NonMedical Emergencies", inc_class == "Assist Civilian - Non-Medical") %>%
group_by(inc_borough) %>%
summarise(incident_number = n()),
aes(x=inc_borough, y=incident_number)) +
geom_bar(stat="identity", position=position_dodge()) +
geom_text(aes(label=incident_number), vjust=1.6, color="white", position = position_dodge(0.9), size=3.5) +
#scale_fill_brewer(palette="Paired") +
labs(title = "Incident Count - Borouh - Valid Response Time in Second", x = "Borough", y = "Incident Count") +
theme_grey()
And we found that the majority of non valid inc_resp_min_indc that are Non-Medical Emergency are from the incident class equal to Assist Civilian - Non-Medical.
For stake of consistency we will consider only the valid observations that have inc_resp_min_indc == "Y".
fire_data <- fire_data %>% filter(inc_resp_min_indc == "Y")
dim(fire_data)
## [1] 32964 32
Now we want to know how many inc_class are summarized in each inc_class_group, to be sure that each inc_class_group is referred to a single inc_class.
print(ctable(fire_data$inc_class, fire_data$inc_class_group, totals = FALSE, headings = FALSE), method = 'render')
| inc_class_group | ||||||||||||||||||||||||
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| inc_class | Medical Emergencies |
Medical MFAs | NonMedical Emergencies |
NonMedical MFAs |
NonStructura l Fires |
Structural Fires |
||||||||||||||||||
| Abandoned Derelict Vehicle Fire | 0 | ( | 0.0% | ) | 0 | ( | 0.0% | ) | 0 | ( | 0.0% | ) | 0 | ( | 0.0% | ) | 6 | ( | 100.0% | ) | 0 | ( | 0.0% | ) |
| Alarm System - Defective | 0 | ( | 0.0% | ) | 0 | ( | 0.0% | ) | 377 | ( | 100.0% | ) | 0 | ( | 0.0% | ) | 0 | ( | 0.0% | ) | 0 | ( | 0.0% | ) |
| Alarm System - Testing | 0 | ( | 0.0% | ) | 0 | ( | 0.0% | ) | 706 | ( | 100.0% | ) | 0 | ( | 0.0% | ) | 0 | ( | 0.0% | ) | 0 | ( | 0.0% | ) |
| Alarm System - Unnecessary | 0 | ( | 0.0% | ) | 0 | ( | 0.0% | ) | 2735 | ( | 100.0% | ) | 0 | ( | 0.0% | ) | 0 | ( | 0.0% | ) | 0 | ( | 0.0% | ) |
| Assist Civilian - Non-Medical | 0 | ( | 0.0% | ) | 0 | ( | 0.0% | ) | 3312 | ( | 100.0% | ) | 0 | ( | 0.0% | ) | 0 | ( | 0.0% | ) | 0 | ( | 0.0% | ) |
| Automobile Fire | 0 | ( | 0.0% | ) | 0 | ( | 0.0% | ) | 0 | ( | 0.0% | ) | 0 | ( | 0.0% | ) | 101 | ( | 100.0% | ) | 0 | ( | 0.0% | ) |
| Brush Fire | 0 | ( | 0.0% | ) | 0 | ( | 0.0% | ) | 0 | ( | 0.0% | ) | 0 | ( | 0.0% | ) | 24 | ( | 100.0% | ) | 0 | ( | 0.0% | ) |
| Carbon Monoxide - Code 1 - Investigation | 0 | ( | 0.0% | ) | 0 | ( | 0.0% | ) | 788 | ( | 100.0% | ) | 0 | ( | 0.0% | ) | 0 | ( | 0.0% | ) | 0 | ( | 0.0% | ) |
| Carbon Monoxide - Code 2 - Incident (1-9 ppm) | 0 | ( | 0.0% | ) | 0 | ( | 0.0% | ) | 129 | ( | 100.0% | ) | 0 | ( | 0.0% | ) | 0 | ( | 0.0% | ) | 0 | ( | 0.0% | ) |
| Carbon Monoxide - Code 3 - Emergency (over 9 ppm) | 0 | ( | 0.0% | ) | 0 | ( | 0.0% | ) | 88 | ( | 100.0% | ) | 0 | ( | 0.0% | ) | 0 | ( | 0.0% | ) | 0 | ( | 0.0% | ) |
| Carbon Monoxide - Code 4 - No Detector Activation | 0 | ( | 0.0% | ) | 0 | ( | 0.0% | ) | 8 | ( | 100.0% | ) | 0 | ( | 0.0% | ) | 0 | ( | 0.0% | ) | 0 | ( | 0.0% | ) |
| Church Fire | 0 | ( | 0.0% | ) | 0 | ( | 0.0% | ) | 0 | ( | 0.0% | ) | 0 | ( | 0.0% | ) | 0 | ( | 0.0% | ) | 10 | ( | 100.0% | ) |
| Defective Oil Burner | 0 | ( | 0.0% | ) | 0 | ( | 0.0% | ) | 34 | ( | 100.0% | ) | 0 | ( | 0.0% | ) | 0 | ( | 0.0% | ) | 0 | ( | 0.0% | ) |
| Demolition Debris or Rubbish Fire | 0 | ( | 0.0% | ) | 0 | ( | 0.0% | ) | 0 | ( | 0.0% | ) | 0 | ( | 0.0% | ) | 272 | ( | 100.0% | ) | 0 | ( | 0.0% | ) |
| Downed Tree | 0 | ( | 0.0% | ) | 0 | ( | 0.0% | ) | 280 | ( | 100.0% | ) | 0 | ( | 0.0% | ) | 0 | ( | 0.0% | ) | 0 | ( | 0.0% | ) |
| Elevator Emergency - Occupied | 0 | ( | 0.0% | ) | 0 | ( | 0.0% | ) | 1850 | ( | 100.0% | ) | 0 | ( | 0.0% | ) | 0 | ( | 0.0% | ) | 0 | ( | 0.0% | ) |
| Elevator Emergency - Unoccupied | 0 | ( | 0.0% | ) | 0 | ( | 0.0% | ) | 708 | ( | 100.0% | ) | 0 | ( | 0.0% | ) | 0 | ( | 0.0% | ) | 0 | ( | 0.0% | ) |
| Factory Fire | 0 | ( | 0.0% | ) | 0 | ( | 0.0% | ) | 0 | ( | 0.0% | ) | 0 | ( | 0.0% | ) | 0 | ( | 0.0% | ) | 1 | ( | 100.0% | ) |
| Hospital Fire | 0 | ( | 0.0% | ) | 0 | ( | 0.0% | ) | 0 | ( | 0.0% | ) | 0 | ( | 0.0% | ) | 0 | ( | 0.0% | ) | 18 | ( | 100.0% | ) |
| Manhole Fire - Blown Cover | 0 | ( | 0.0% | ) | 0 | ( | 0.0% | ) | 0 | ( | 0.0% | ) | 0 | ( | 0.0% | ) | 9 | ( | 100.0% | ) | 0 | ( | 0.0% | ) |
| Manhole Fire - Other | 0 | ( | 0.0% | ) | 0 | ( | 0.0% | ) | 0 | ( | 0.0% | ) | 0 | ( | 0.0% | ) | 55 | ( | 100.0% | ) | 0 | ( | 0.0% | ) |
| Manhole Fire - Seeping Smoke | 0 | ( | 0.0% | ) | 0 | ( | 0.0% | ) | 0 | ( | 0.0% | ) | 0 | ( | 0.0% | ) | 104 | ( | 100.0% | ) | 0 | ( | 0.0% | ) |
| Maritime Emergency | 0 | ( | 0.0% | ) | 0 | ( | 0.0% | ) | 0 | ( | 0.0% | ) | 0 | ( | 0.0% | ) | 0 | ( | 0.0% | ) | 0 | ( | 0.0% | ) |
| Maritime Fire | 0 | ( | 0.0% | ) | 0 | ( | 0.0% | ) | 0 | ( | 0.0% | ) | 0 | ( | 0.0% | ) | 0 | ( | 0.0% | ) | 0 | ( | 0.0% | ) |
| Medical - Assist Civilian | 27 | ( | 100.0% | ) | 0 | ( | 0.0% | ) | 0 | ( | 0.0% | ) | 0 | ( | 0.0% | ) | 0 | ( | 0.0% | ) | 0 | ( | 0.0% | ) |
| Medical - Breathing / Ill or Sick | 4779 | ( | 100.0% | ) | 0 | ( | 0.0% | ) | 0 | ( | 0.0% | ) | 0 | ( | 0.0% | ) | 0 | ( | 0.0% | ) | 0 | ( | 0.0% | ) |
| Medical - EMS Link 10-91 | 1096 | ( | 100.0% | ) | 0 | ( | 0.0% | ) | 0 | ( | 0.0% | ) | 0 | ( | 0.0% | ) | 0 | ( | 0.0% | ) | 0 | ( | 0.0% | ) |
| Medical - No PT Contact EMS is Onscene | 4285 | ( | 100.0% | ) | 0 | ( | 0.0% | ) | 0 | ( | 0.0% | ) | 0 | ( | 0.0% | ) | 0 | ( | 0.0% | ) | 0 | ( | 0.0% | ) |
| Medical - PD Link 10-91 | 868 | ( | 100.0% | ) | 0 | ( | 0.0% | ) | 0 | ( | 0.0% | ) | 0 | ( | 0.0% | ) | 0 | ( | 0.0% | ) | 0 | ( | 0.0% | ) |
| Medical - Serious Life Threatening | 366 | ( | 100.0% | ) | 0 | ( | 0.0% | ) | 0 | ( | 0.0% | ) | 0 | ( | 0.0% | ) | 0 | ( | 0.0% | ) | 0 | ( | 0.0% | ) |
| Medical - Victim Deceased | 287 | ( | 100.0% | ) | 0 | ( | 0.0% | ) | 0 | ( | 0.0% | ) | 0 | ( | 0.0% | ) | 0 | ( | 0.0% | ) | 0 | ( | 0.0% | ) |
| Medical MFA - EMS Link | 0 | ( | 0.0% | ) | 87 | ( | 100.0% | ) | 0 | ( | 0.0% | ) | 0 | ( | 0.0% | ) | 0 | ( | 0.0% | ) | 0 | ( | 0.0% | ) |
| Medical MFA - PD Link | 0 | ( | 0.0% | ) | 77 | ( | 100.0% | ) | 0 | ( | 0.0% | ) | 0 | ( | 0.0% | ) | 0 | ( | 0.0% | ) | 0 | ( | 0.0% | ) |
| Multiple Dwelling 'A' - Compactor fire | 0 | ( | 0.0% | ) | 0 | ( | 0.0% | ) | 0 | ( | 0.0% | ) | 0 | ( | 0.0% | ) | 0 | ( | 0.0% | ) | 4 | ( | 100.0% | ) |
| Multiple Dwelling 'A' - Food on the stove fire | 0 | ( | 0.0% | ) | 0 | ( | 0.0% | ) | 0 | ( | 0.0% | ) | 0 | ( | 0.0% | ) | 0 | ( | 0.0% | ) | 519 | ( | 100.0% | ) |
| Multiple Dwelling 'A' - Other fire | 0 | ( | 0.0% | ) | 0 | ( | 0.0% | ) | 0 | ( | 0.0% | ) | 0 | ( | 0.0% | ) | 0 | ( | 0.0% | ) | 168 | ( | 100.0% | ) |
| Multiple Dwelling 'B' Fire | 0 | ( | 0.0% | ) | 0 | ( | 0.0% | ) | 0 | ( | 0.0% | ) | 0 | ( | 0.0% | ) | 0 | ( | 0.0% | ) | 85 | ( | 100.0% | ) |
| Non-Medical 10-91 (Unnecessary Alarm) | 0 | ( | 0.0% | ) | 0 | ( | 0.0% | ) | 102 | ( | 100.0% | ) | 0 | ( | 0.0% | ) | 0 | ( | 0.0% | ) | 0 | ( | 0.0% | ) |
| Non-Medical MFA - ERS | 0 | ( | 0.0% | ) | 0 | ( | 0.0% | ) | 0 | ( | 0.0% | ) | 586 | ( | 100.0% | ) | 0 | ( | 0.0% | ) | 0 | ( | 0.0% | ) |
| Non-Medical MFA - ERS No Contact | 0 | ( | 0.0% | ) | 0 | ( | 0.0% | ) | 0 | ( | 0.0% | ) | 1 | ( | 100.0% | ) | 0 | ( | 0.0% | ) | 0 | ( | 0.0% | ) |
| Non-Medical MFA - Phone | 0 | ( | 0.0% | ) | 0 | ( | 0.0% | ) | 0 | ( | 0.0% | ) | 701 | ( | 100.0% | ) | 0 | ( | 0.0% | ) | 0 | ( | 0.0% | ) |
| Non-Medical MFA - Private Fire Alarm | 0 | ( | 0.0% | ) | 0 | ( | 0.0% | ) | 0 | ( | 0.0% | ) | 223 | ( | 100.0% | ) | 0 | ( | 0.0% | ) | 0 | ( | 0.0% | ) |
| Non-Medical MFA - Verbal | 0 | ( | 0.0% | ) | 0 | ( | 0.0% | ) | 0 | ( | 0.0% | ) | 7 | ( | 100.0% | ) | 0 | ( | 0.0% | ) | 0 | ( | 0.0% | ) |
| Odor - Other Smoke | 0 | ( | 0.0% | ) | 0 | ( | 0.0% | ) | 166 | ( | 100.0% | ) | 0 | ( | 0.0% | ) | 0 | ( | 0.0% | ) | 0 | ( | 0.0% | ) |
| Odor - Other Than Smoke | 0 | ( | 0.0% | ) | 0 | ( | 0.0% | ) | 1317 | ( | 100.0% | ) | 0 | ( | 0.0% | ) | 0 | ( | 0.0% | ) | 0 | ( | 0.0% | ) |
| Other Commercial Building Fire | 0 | ( | 0.0% | ) | 0 | ( | 0.0% | ) | 0 | ( | 0.0% | ) | 0 | ( | 0.0% | ) | 0 | ( | 0.0% | ) | 184 | ( | 100.0% | ) |
| Other Public Building Fire | 0 | ( | 0.0% | ) | 0 | ( | 0.0% | ) | 0 | ( | 0.0% | ) | 0 | ( | 0.0% | ) | 0 | ( | 0.0% | ) | 4 | ( | 100.0% | ) |
| Other Transportation Fire | 0 | ( | 0.0% | ) | 0 | ( | 0.0% | ) | 0 | ( | 0.0% | ) | 0 | ( | 0.0% | ) | 14 | ( | 100.0% | ) | 0 | ( | 0.0% | ) |
| Private Dwelling Fire | 0 | ( | 0.0% | ) | 0 | ( | 0.0% | ) | 0 | ( | 0.0% | ) | 0 | ( | 0.0% | ) | 0 | ( | 0.0% | ) | 412 | ( | 100.0% | ) |
| Remove Civilian - Non-Fire | 0 | ( | 0.0% | ) | 0 | ( | 0.0% | ) | 27 | ( | 100.0% | ) | 0 | ( | 0.0% | ) | 0 | ( | 0.0% | ) | 0 | ( | 0.0% | ) |
| School Fire | 0 | ( | 0.0% | ) | 0 | ( | 0.0% | ) | 0 | ( | 0.0% | ) | 0 | ( | 0.0% | ) | 0 | ( | 0.0% | ) | 31 | ( | 100.0% | ) |
| Sprinkler System - Activated | 0 | ( | 0.0% | ) | 0 | ( | 0.0% | ) | 6 | ( | 100.0% | ) | 0 | ( | 0.0% | ) | 0 | ( | 0.0% | ) | 0 | ( | 0.0% | ) |
| Sprinkler System - Malfunction | 0 | ( | 0.0% | ) | 0 | ( | 0.0% | ) | 41 | ( | 100.0% | ) | 0 | ( | 0.0% | ) | 0 | ( | 0.0% | ) | 0 | ( | 0.0% | ) |
| Sprinkler System - Working on System | 0 | ( | 0.0% | ) | 0 | ( | 0.0% | ) | 28 | ( | 100.0% | ) | 0 | ( | 0.0% | ) | 0 | ( | 0.0% | ) | 0 | ( | 0.0% | ) |
| Store Fire | 0 | ( | 0.0% | ) | 0 | ( | 0.0% | ) | 0 | ( | 0.0% | ) | 0 | ( | 0.0% | ) | 0 | ( | 0.0% | ) | 9 | ( | 100.0% | ) |
| Transit System - NonStructural | 0 | ( | 0.0% | ) | 0 | ( | 0.0% | ) | 0 | ( | 0.0% | ) | 0 | ( | 0.0% | ) | 59 | ( | 100.0% | ) | 0 | ( | 0.0% | ) |
| Transit System - Structural | 0 | ( | 0.0% | ) | 0 | ( | 0.0% | ) | 0 | ( | 0.0% | ) | 0 | ( | 0.0% | ) | 0 | ( | 0.0% | ) | 1 | ( | 100.0% | ) |
| Transit System Emergency | 0 | ( | 0.0% | ) | 0 | ( | 0.0% | ) | 18 | ( | 100.0% | ) | 0 | ( | 0.0% | ) | 0 | ( | 0.0% | ) | 0 | ( | 0.0% | ) |
| Undefined Emergency | 0 | ( | 0.0% | ) | 0 | ( | 0.0% | ) | 71 | ( | 100.0% | ) | 0 | ( | 0.0% | ) | 0 | ( | 0.0% | ) | 0 | ( | 0.0% | ) |
| Under Contruction / Vacant Fire | 0 | ( | 0.0% | ) | 0 | ( | 0.0% | ) | 0 | ( | 0.0% | ) | 0 | ( | 0.0% | ) | 0 | ( | 0.0% | ) | 1 | ( | 100.0% | ) |
| Utility Emergency - Electric | 0 | ( | 0.0% | ) | 0 | ( | 0.0% | ) | 595 | ( | 100.0% | ) | 0 | ( | 0.0% | ) | 0 | ( | 0.0% | ) | 0 | ( | 0.0% | ) |
| Utility Emergency - Gas | 0 | ( | 0.0% | ) | 0 | ( | 0.0% | ) | 1335 | ( | 100.0% | ) | 0 | ( | 0.0% | ) | 0 | ( | 0.0% | ) | 0 | ( | 0.0% | ) |
| Utility Emergency - Steam | 0 | ( | 0.0% | ) | 0 | ( | 0.0% | ) | 137 | ( | 100.0% | ) | 0 | ( | 0.0% | ) | 0 | ( | 0.0% | ) | 0 | ( | 0.0% | ) |
| Utility Emergency - Undefined | 0 | ( | 0.0% | ) | 0 | ( | 0.0% | ) | 4 | ( | 100.0% | ) | 0 | ( | 0.0% | ) | 0 | ( | 0.0% | ) | 0 | ( | 0.0% | ) |
| Utility Emergency - Water | 0 | ( | 0.0% | ) | 0 | ( | 0.0% | ) | 1157 | ( | 100.0% | ) | 0 | ( | 0.0% | ) | 0 | ( | 0.0% | ) | 0 | ( | 0.0% | ) |
| Vehicle Accident - Other | 0 | ( | 0.0% | ) | 0 | ( | 0.0% | ) | 1443 | ( | 100.0% | ) | 0 | ( | 0.0% | ) | 0 | ( | 0.0% | ) | 0 | ( | 0.0% | ) |
| Vehicle Accident - With Extrication | 0 | ( | 0.0% | ) | 0 | ( | 0.0% | ) | 21 | ( | 100.0% | ) | 0 | ( | 0.0% | ) | 0 | ( | 0.0% | ) | 0 | ( | 0.0% | ) |
Generated by summarytools 1.0.1 (R version 4.3.2)
2024-01-13
As we can see from the upper table all the inc_class_group have a unique set of values.
At this point to be more clear we display each main class with each respective sub-class.
for (variable in levels(fire_data$inc_class_group)) {
non_zero_table <- table(subset(fire_data, inc_class_group == variable)$inc_class)
cat(variable, "\n")
print(non_zero_table[non_zero_table != 0])
cat("\n")
}
## Medical Emergencies
##
## Medical - Assist Civilian Medical - Breathing / Ill or Sick
## 27 4779
## Medical - EMS Link 10-91 Medical - No PT Contact EMS is Onscene
## 1096 4285
## Medical - PD Link 10-91 Medical - Serious Life Threatening
## 868 366
## Medical - Victim Deceased
## 287
##
## Medical MFAs
##
## Medical MFA - EMS Link Medical MFA - PD Link
## 87 77
##
## NonMedical Emergencies
##
## Alarm System - Defective
## 377
## Alarm System - Testing
## 706
## Alarm System - Unnecessary
## 2735
## Assist Civilian - Non-Medical
## 3312
## Carbon Monoxide - Code 1 - Investigation
## 788
## Carbon Monoxide - Code 2 - Incident (1-9 ppm)
## 129
## Carbon Monoxide - Code 3 - Emergency (over 9 ppm)
## 88
## Carbon Monoxide - Code 4 - No Detector Activation
## 8
## Defective Oil Burner
## 34
## Downed Tree
## 280
## Elevator Emergency - Occupied
## 1850
## Elevator Emergency - Unoccupied
## 708
## Non-Medical 10-91 (Unnecessary Alarm)
## 102
## Odor - Other Smoke
## 166
## Odor - Other Than Smoke
## 1317
## Remove Civilian - Non-Fire
## 27
## Sprinkler System - Activated
## 6
## Sprinkler System - Malfunction
## 41
## Sprinkler System - Working on System
## 28
## Transit System Emergency
## 18
## Undefined Emergency
## 71
## Utility Emergency - Electric
## 595
## Utility Emergency - Gas
## 1335
## Utility Emergency - Steam
## 137
## Utility Emergency - Undefined
## 4
## Utility Emergency - Water
## 1157
## Vehicle Accident - Other
## 1443
## Vehicle Accident - With Extrication
## 21
##
## NonMedical MFAs
##
## Non-Medical MFA - ERS Non-Medical MFA - ERS No Contact
## 586 1
## Non-Medical MFA - Phone Non-Medical MFA - Private Fire Alarm
## 701 223
## Non-Medical MFA - Verbal
## 7
##
## NonStructural Fires
##
## Abandoned Derelict Vehicle Fire Automobile Fire
## 6 101
## Brush Fire Demolition Debris or Rubbish Fire
## 24 272
## Manhole Fire - Blown Cover Manhole Fire - Other
## 9 55
## Manhole Fire - Seeping Smoke Other Transportation Fire
## 104 14
## Transit System - NonStructural
## 59
##
## Structural Fires
##
## Church Fire
## 10
## Factory Fire
## 1
## Hospital Fire
## 18
## Multiple Dwelling 'A' - Compactor fire
## 4
## Multiple Dwelling 'A' - Food on the stove fire
## 519
## Multiple Dwelling 'A' - Other fire
## 168
## Multiple Dwelling 'B' Fire
## 85
## Other Commercial Building Fire
## 184
## Other Public Building Fire
## 4
## Private Dwelling Fire
## 412
## School Fire
## 31
## Store Fire
## 9
## Transit System - Structural
## 1
## Under Contruction / Vacant Fire
## 1
At this point is essential to deal with NA values, trying to find the presence of possible relation with predictors. First things first let’s recap the number of NA values for each columns that we have at the moment.
colSums(is.na(fire_data))
## id al_number al_location
## 0 0 0
## inc_borough zipcode pol_prec
## 0 2197 2197
## city_con_dist commu_dist commu_sc_dist
## 2197 2197 2198
## cong_dist al_source_desc al_index_desc
## 2197 0 0
## highest_al_level inc_class inc_class_group
## 0 0 0
## disp_resp_min_qy first_ass_datetime first_act_datetime
## 0 0 41
## first_onscene_datetime inc_close_datetime inc_resp_min_indc
## 0 0 0
## inc_resp_min_qy inc_travel_min_qy engines_assigned
## 0 0 4
## ladders_assigned others_units_assigned emergency_min_qy
## 4 4 0
## day_type ticket_time time_of_day
## 0 0 0
## total_assigned_unit tua_is_one
## 4 4
Here we will check if there is a pattern on the absence of values in the following predictors: zipcode, pol_prec, city_con_dist, commu_dist, commu_sc_dist and cong_dist.
na_locations <- fire_data %>%
filter(is.na(zipcode) | is.na(pol_prec) | is.na(city_con_dist) | is.na(commu_dist) | is.na(commu_sc_dist) | is.na(cong_dist))
ggplot(data=na_locations %>%
group_by(inc_class_group, inc_borough) %>%
summarise(incident_number = n()),
aes(x=inc_borough, y=incident_number, fill=inc_class_group)) + geom_bar(stat="identity", position=position_dodge()) +
geom_text(aes(label=incident_number), vjust=1.6, color="black",
position = position_dodge(0.9), size=3.5) +
#scale_fill_brewer(palette="Paired") +
labs(title = "NA location", x = "Borough", y = "Incident Count", fill = "Incident Class Group") +
theme_grey()
## `summarise()` has grouped output by 'inc_class_group'. You can override using
## the `.groups` argument.
By the Bar Chart we note that the majority of observations that have at least one of the location predictors to NA are of the incident class group NonMedical Emergency, Non Medical MFAs and Medical Emergencies
table(na_locations$inc_borough) / table(fire_data$inc_borough)
##
## Bronx Brooklyn Manhattan Queens Staten Island
## 0.07104538 0.04694547 0.07662157 0.07700328 0.07523697
table(na_locations$inc_class_group) / table(fire_data$inc_class_group)
##
## Medical Emergencies Medical MFAs NonMedical Emergencies
## 0.03988726 0.10975610 0.05691243
## NonMedical MFAs NonStructural Fires Structural Fires
## 0.40447958 0.14906832 0.00552868
Moreover around the 40% of the whole incident that are of the incident class group NonMedical MFAs have at least one of the location columns to NA. Let’s investigate.
fd_nm_mfa_cl <- table(subset(fire_data, inc_class_group == "NonMedical MFAs")$inc_class)
fd_nm_mfa_bro <- table(subset(fire_data, inc_class_group == "NonMedical MFAs")$inc_borough)
fd_nm_mfa_cl <- fd_nm_mfa_cl[fd_nm_mfa_cl != 0]
fd_nm_mfa_cl
##
## Non-Medical MFA - ERS Non-Medical MFA - ERS No Contact
## 586 1
## Non-Medical MFA - Phone Non-Medical MFA - Private Fire Alarm
## 701 223
## Non-Medical MFA - Verbal
## 7
In the original dataset this is the distribution of inc_class for the NonMedical MF
na_nm_mfa_cl <- table(subset(na_locations, inc_class_group == "NonMedical MFAs")$inc_class)
na_nm_mfa_bro <- table(subset(na_locations, inc_class_group == "NonMedical MFAs")$inc_borough)
na_nm_mfa_cl <- na_nm_mfa_cl[names(fd_nm_mfa_cl)]
na_nm_mfa_cl
##
## Non-Medical MFA - ERS Non-Medical MFA - ERS No Contact
## 573 1
## Non-Medical MFA - Phone Non-Medical MFA - Private Fire Alarm
## 38 2
## Non-Medical MFA - Verbal
## 0
na_nm_mfa_cl / fd_nm_mfa_cl
##
## Non-Medical MFA - ERS Non-Medical MFA - ERS No Contact
## 0.97781570 1.00000000
## Non-Medical MFA - Phone Non-Medical MFA - Private Fire Alarm
## 0.05420827 0.00896861
## Non-Medical MFA - Verbal
## 0.00000000
So the 97% of all the Non-Medical MFA - ERS observations in the entire dataset have one of the location attribute equal to NA
na_nm_mfa_bro / fd_nm_mfa_bro
##
## Bronx Brooklyn Manhattan Queens Staten Island
## 0.4676056 0.3075221 0.3894472 0.3733333 0.7954545
And from here we can see that about the 78% of the observations that are NonMedical - MFAs that have at least one district column attribute to NA are from the RICHMOND / STATEN ISLAND. Also BRONX has about half of the NonMedical - MFAs observations that have at least one district column to NA.
print(fire_data %>%
filter(is.na(engines_assigned) | is.na(ladders_assigned) | is.na(others_units_assigned)) %>%
group_by(inc_borough, inc_class)) %>%
summarise(incident_count = n())
## # A tibble: 4 × 32
## # Groups: inc_borough, inc_class [4]
## id al_number al_location inc_borough zipcode pol_prec city_con_dist
## <chr> <int> <chr> <fct> <int> <int> <int>
## 1 230905-Q4545… 4545 53 AVE & 6… Queens 11378 104 30
## 2 230914-Q1014… 1014 CENTRAL AV… Queens 11691 101 31
## 3 230918-Q9643… 9643 JAMAICA AV… Queens 11418 102 29
## 4 230919-M0684… 684 1 AVE & E … Manhattan 10016 13 4
## # ℹ 25 more variables: commu_dist <int>, commu_sc_dist <int>, cong_dist <int>,
## # al_source_desc <fct>, al_index_desc <fct>, highest_al_level <fct>,
## # inc_class <fct>, inc_class_group <fct>, disp_resp_min_qy <dbl>,
## # first_ass_datetime <dttm>, first_act_datetime <dttm>,
## # first_onscene_datetime <dttm>, inc_close_datetime <dttm>,
## # inc_resp_min_indc <fct>, inc_resp_min_qy <dbl>, inc_travel_min_qy <dbl>,
## # engines_assigned <int>, ladders_assigned <int>, …
## `summarise()` has grouped output by 'inc_borough'. You can override using the
## `.groups` argument.
## # A tibble: 4 × 3
## # Groups: inc_borough [2]
## inc_borough inc_class incident_count
## <fct> <fct> <int>
## 1 Manhattan Vehicle Accident - Other 1
## 2 Queens Assist Civilian - Non-Medical 1
## 3 Queens Medical - No PT Contact EMS is Onscene 1
## 4 Queens Medical - PD Link 10-91 1
We can easily remove this observations.
na_first_act_datetime <- fire_data %>% filter(is.na(first_act_datetime))
print(na_first_act_datetime %>% group_by(inc_class, inc_borough) %>% summarise(incident_count = n()))
## `summarise()` has grouped output by 'inc_class'. You can override using the
## `.groups` argument.
## # A tibble: 27 × 3
## # Groups: inc_class [15]
## inc_class inc_borough incident_count
## <fct> <fct> <int>
## 1 Alarm System - Unnecessary Brooklyn 2
## 2 Assist Civilian - Non-Medical Bronx 1
## 3 Assist Civilian - Non-Medical Brooklyn 4
## 4 Assist Civilian - Non-Medical Queens 1
## 5 Demolition Debris or Rubbish Fire Brooklyn 1
## 6 Downed Tree Manhattan 1
## 7 Downed Tree Queens 1
## 8 Downed Tree Staten Island 1
## 9 Elevator Emergency - Occupied Brooklyn 1
## 10 Elevator Emergency - Occupied Manhattan 1
## # ℹ 17 more rows
ggplot(data=na_first_act_datetime %>%
group_by(inc_class_group, inc_borough) %>%
summarise(incident_number = n()),
aes(x=inc_borough, y=incident_number, fill=inc_class_group)) + geom_bar(stat="identity", position=position_dodge()) +
labs(title = "NA First Act Date", x = "Borough", y = "Incident Count", fill = "Incident Class Group") +
theme_minimal()
## `summarise()` has grouped output by 'inc_class_group'. You can override using
## the `.groups` argument.
Seems to be random and thus there is no pattern that motivate the presence of NA values in first_act_datetime.
At this point we can omit the NA values.
fire_data_new <- na.omit(fire_data)
And the un-usefull predictors
fire_data_new <- fire_data_new %>% select(-c(zipcode, pol_prec, city_con_dist, commu_dist, al_location,
commu_sc_dist, cong_dist, first_ass_datetime, first_act_datetime,
first_onscene_datetime, inc_close_datetime, inc_resp_min_indc,
id, al_number, inc_class,))
#tua_is_one, total_assigned_unit))
print(dfSummary(fire_data,
plain.ascii = FALSE,
style = "multiline",
headings = FALSE,
graph.magnif = 0.8,
valid.col = FALSE),
method = 'render')
| No | Variable | Stats / Values | Freqs (% of Valid) | Graph | Missing | |||||||||||||||||||||||||||||||||||||||||||||||||||||||
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 1 | id [character] |
|
|
0 (0.0%) | ||||||||||||||||||||||||||||||||||||||||||||||||||||||||
| 2 | al_number [integer] |
|
6836 distinct values | 0 (0.0%) | ||||||||||||||||||||||||||||||||||||||||||||||||||||||||
| 3 | al_location [character] |
|
|
0 (0.0%) | ||||||||||||||||||||||||||||||||||||||||||||||||||||||||
| 4 | inc_borough [factor] |
|
|
0 (0.0%) | ||||||||||||||||||||||||||||||||||||||||||||||||||||||||
| 5 | zipcode [integer] |
|
210 distinct values | 2197 (6.7%) | ||||||||||||||||||||||||||||||||||||||||||||||||||||||||
| 6 | pol_prec [integer] |
|
77 distinct values | 2197 (6.7%) | ||||||||||||||||||||||||||||||||||||||||||||||||||||||||
| 7 | city_con_dist [integer] |
|
51 distinct values | 2197 (6.7%) | ||||||||||||||||||||||||||||||||||||||||||||||||||||||||
| 8 | commu_dist [integer] |
|
68 distinct values | 2197 (6.7%) | ||||||||||||||||||||||||||||||||||||||||||||||||||||||||
| 9 | commu_sc_dist [integer] |
|
32 distinct values | 2198 (6.7%) | ||||||||||||||||||||||||||||||||||||||||||||||||||||||||
| 10 | cong_dist [integer] |
|
13 distinct values | 2197 (6.7%) | ||||||||||||||||||||||||||||||||||||||||||||||||||||||||
| 11 | al_source_desc [factor] |
|
|
0 (0.0%) | ||||||||||||||||||||||||||||||||||||||||||||||||||||||||
| 12 | al_index_desc [factor] |
|
|
0 (0.0%) | ||||||||||||||||||||||||||||||||||||||||||||||||||||||||
| 13 | highest_al_level [factor] |
|
|
0 (0.0%) | ||||||||||||||||||||||||||||||||||||||||||||||||||||||||
| 14 | inc_class [factor] |
|
|
0 (0.0%) | ||||||||||||||||||||||||||||||||||||||||||||||||||||||||
| 15 | inc_class_group [factor] |
|
|
0 (0.0%) | ||||||||||||||||||||||||||||||||||||||||||||||||||||||||
| 16 | disp_resp_min_qy [numeric] |
|
198 distinct values | 0 (0.0%) | ||||||||||||||||||||||||||||||||||||||||||||||||||||||||
| 17 | first_ass_datetime [POSIXct, POSIXt] |
|
32747 distinct values | 0 (0.0%) | ||||||||||||||||||||||||||||||||||||||||||||||||||||||||
| 18 | first_act_datetime [POSIXct, POSIXt] |
|
32617 distinct values | 41 (0.1%) | ||||||||||||||||||||||||||||||||||||||||||||||||||||||||
| 19 | first_onscene_datetime [POSIXct, POSIXt] |
|
32679 distinct values | 0 (0.0%) | ||||||||||||||||||||||||||||||||||||||||||||||||||||||||
| 20 | inc_close_datetime [POSIXct, POSIXt] |
|
32681 distinct values | 0 (0.0%) | ||||||||||||||||||||||||||||||||||||||||||||||||||||||||
| 21 | inc_resp_min_indc [factor] |
|
|
0 (0.0%) | ||||||||||||||||||||||||||||||||||||||||||||||||||||||||
| 22 | inc_resp_min_qy [numeric] |
|
1181 distinct values | 0 (0.0%) | ||||||||||||||||||||||||||||||||||||||||||||||||||||||||
| 23 | inc_travel_min_qy [numeric] |
|
1157 distinct values | 0 (0.0%) | ||||||||||||||||||||||||||||||||||||||||||||||||||||||||
| 24 | engines_assigned [integer] |
|
15 distinct values | 4 (0.0%) | ||||||||||||||||||||||||||||||||||||||||||||||||||||||||
| 25 | ladders_assigned [integer] |
|
12 distinct values | 4 (0.0%) | ||||||||||||||||||||||||||||||||||||||||||||||||||||||||
| 26 | others_units_assigned [integer] |
|
22 distinct values | 4 (0.0%) | ||||||||||||||||||||||||||||||||||||||||||||||||||||||||
| 27 | emergency_min_qy [numeric] |
|
4170 distinct values | 0 (0.0%) | ||||||||||||||||||||||||||||||||||||||||||||||||||||||||
| 28 | day_type [factor] |
|
|
0 (0.0%) | ||||||||||||||||||||||||||||||||||||||||||||||||||||||||
| 29 | ticket_time [numeric] |
|
4389 distinct values | 0 (0.0%) | ||||||||||||||||||||||||||||||||||||||||||||||||||||||||
| 30 | time_of_day [factor] |
|
|
0 (0.0%) | ||||||||||||||||||||||||||||||||||||||||||||||||||||||||
| 31 | total_assigned_unit [integer] |
|
34 distinct values | 4 (0.0%) | ||||||||||||||||||||||||||||||||||||||||||||||||||||||||
| 32 | tua_is_one [factor] |
|
|
4 (0.0%) |
Generated by summarytools 1.0.1 (R version 4.3.2)
2024-01-13
In this section we will have a look on additional data visualisation in order to better understand how the predictors behaves.
summary(fire_data_new)
## inc_borough al_source_desc al_index_desc
## Bronx :6406 PHONE :13465 DEFAULT RECORD: 2979
## Brooklyn :9280 EMS : 7369 Initial Alarm :27631
## Manhattan :7294 EMS-911: 4349 Others : 118
## Queens :6188 CLASS-3: 4817
## Staten Island:1560 Others : 728
##
## highest_al_level inc_class_group disp_resp_min_qy
## All Hands Working: 94 Medical Emergencies :11222 Min. : 0.03333
## First Alarm :30626 Medical MFAs : 146 1st Qu.: 0.13333
## 2nd-3rd Alarm : 8 NonMedical Emergencies:16471 Median : 0.48333
## NonMedical MFAs : 904 Mean : 0.50062
## NonStructural Fires : 547 3rd Qu.: 0.71667
## Structural Fires : 1438 Max. :34.73333
## inc_resp_min_qy inc_travel_min_qy engines_assigned ladders_assigned
## Min. : 0.350 Min. : 0.000 Min. : 0.000 Min. : 0.0000
## 1st Qu.: 4.383 1st Qu.: 3.883 1st Qu.: 1.000 1st Qu.: 0.0000
## Median : 5.467 Median : 4.983 Median : 1.000 Median : 1.0000
## Mean : 5.949 Mean : 5.448 Mean : 1.146 Mean : 0.7795
## 3rd Qu.: 6.850 3rd Qu.: 6.367 3rd Qu.: 1.000 3rd Qu.: 1.0000
## Max. :58.917 Max. :58.617 Max. :19.000 Max. :15.0000
## others_units_assigned emergency_min_qy day_type ticket_time
## Min. : 0.0000 Min. : 0.00 Weekday:22379 Min. : 1.083
## 1st Qu.: 0.0000 1st Qu.: 7.15 Weekend: 8349 1st Qu.: 12.850
## Median : 0.0000 Median : 11.93 Median : 17.900
## Mean : 0.3899 Mean : 17.08 Mean : 23.027
## 3rd Qu.: 1.0000 3rd Qu.: 19.58 3rd Qu.: 25.954
## Max. :32.0000 Max. :596.38 Max. :601.717
## time_of_day total_assigned_unit tua_is_one
## Night : 4487 Min. : 1.000 N:12940
## Morning : 8370 1st Qu.: 1.000 Y:17788
## Afternoon:10627 Median : 1.000
## Evening : 7244 Mean : 2.316
## 3rd Qu.: 3.000
## Max. :66.000
ggplot(fire_data_new,
aes(x = inc_borough, y = inc_resp_min_qy, color = inc_borough)) + # ggplot function
geom_boxplot()# +
#labs(title = "Borough - Incident Number - Alarm Source", x = "Borough", y = "Incident Number", color = "Alarm Source")
ggplot(fire_data_new,
aes(x = al_source_desc, y = inc_resp_min_qy, color = al_source_desc)) + # ggplot function
geom_boxplot()# +
#labs(title = "Borough - Incident Number - Alarm Source", x = "Borough", y = "Incident Number", color = "Alarm Source")
ggplot(fire_data_new,
aes(x = day_type, y = inc_resp_min_qy, color = day_type)) + # ggplot function
geom_boxplot()# +
#labs(title = "Borough - Incident Number - Alarm Source", x = "Borough", y = "Incident Number", color = "Alarm Source")
ggplot(fire_data_new,
aes(x = time_of_day, y = inc_resp_min_qy, color = time_of_day)) + # ggplot function
geom_boxplot()# +
#labs(title = "Borough - Incident Number - Alarm Source", x = "Borough", y = "Incident Number", color = "Alarm Source")
ggplot(fire_data_new,
aes(x = inc_borough, y = others_units_assigned, color = inc_borough)) + # ggplot function
geom_boxplot()# +
In this section we plot additional data visualization focus on the geographical visualization of the New York borough with relative predictors. In order to do so we load an additional datasets:
The fdny-firehouse-listing.csv is a dataset that includes the geographical informations of every firefighter stations in the NYC, including again latitude and longitude.
firefighter_stations <- read.csv("datasets/fdny-firehouse-listing.csv")
head(firefighter_stations)
## FacilityName FacilityAddress
## 1 Engine 4/Ladder 15 42 South Street
## 2 Engine 10/Ladder 10 124 Liberty Street
## 3 Engine 6 49 Beekman Street
## 4 Engine 7/Ladder 1/Battalion 1/Manhattan Borough Command 100-104 Duane Street
## 5 Ladder 8 14 North Moore Street
## 6 Engine 9/Ladder 6 75 Canal Street
## Borough Postcode Latitude Longitude Community.Board Community.Council
## 1 Manhattan 10005 40.70347 -74.00754 1 1
## 2 Manhattan 10006 40.71007 -74.01252 1 1
## 3 Manhattan 10038 40.71005 -74.00525 1 1
## 4 Manhattan 10007 40.71546 -74.00594 1 1
## 5 Manhattan 10013 40.71976 -74.00668 1 1
## 6 Manhattan 10002 40.71521 -73.99290 3 1
## Census.Tract BIN BBL
## 1 7 1000867 1000350001
## 2 13 1075700 1000520022
## 3 1501 1001287 1000930030
## 4 33 1001647 1001500025
## 5 33 1002150 1001890035
## 6 16 1003898 1003000030
## NTA
## 1 Battery Park City-Lower Manhattan
## 2 Battery Park City-Lower Manhattan
## 3 Battery Park City-Lower Manhattan
## 4 SoHo-TriBeCa-Civic Center-Little Italy
## 5 SoHo-TriBeCa-Civic Center-Little Italy
## 6 Chinatown
summary(firefighter_stations)
## FacilityName FacilityAddress Borough Postcode
## Length:218 Length:218 Length:218 Min. :10001
## Class :character Class :character Class :character 1st Qu.:10304
## Mode :character Mode :character Mode :character Median :11103
## Mean :10784
## 3rd Qu.:11231
## Max. :11695
## NA's :5
## Latitude Longitude Community.Board Community.Council
## Min. :40.51 Min. :-74.24 Min. : 1.000 Min. : 1.00
## 1st Qu.:40.66 1st Qu.:-73.99 1st Qu.: 3.000 1st Qu.:12.00
## Median :40.72 Median :-73.94 Median : 6.000 Median :27.00
## Mean :40.72 Mean :-73.94 Mean : 7.075 Mean :25.63
## 3rd Qu.:40.77 3rd Qu.:-73.89 3rd Qu.:11.000 3rd Qu.:38.00
## Max. :40.89 Max. :-73.72 Max. :84.000 Max. :51.00
## NA's :5 NA's :5 NA's :5 NA's :5
## Census.Tract BIN BBL NTA
## Min. : 1 Min. :1000867 Min. :1.000e+09 Length:218
## 1st Qu.: 129 1st Qu.:2003268 1st Qu.:2.025e+09 Class :character
## Median : 275 Median :3064786 Median :3.025e+09 Mode :character
## Mean : 5950 Mean :2900421 Mean :2.850e+09
## 3rd Qu.: 800 3rd Qu.:4090228 3rd Qu.:4.033e+09
## Max. :157902 Max. :5154879 Max. :5.080e+09
## NA's :5 NA's :5 NA's :5
We now start with the firefighter stations dataset. By first making a copy of the fire_data_new and setting the borough from the firefighter_stations dataset to factor in order to be easily merged with the copied fire_data dataset.
# make a copy of the fire_data
fire_data_for_ffs <- fire_data_new
fire_data_for_ffs <- fire_data_for_ffs %>% rename(borough = inc_borough)
firefighter_stations$Borough <- as.factor(firefighter_stations$Borough)
firefighter_stations <- firefighter_stations %>% rename(borough = Borough)
# remove the na values from firefighter_stations
firefighter_stations <- na.omit(firefighter_stations)
Now we want to get the number of firefighter station for each borough.
stations_borough <- firefighter_stations %>%
group_by(borough) %>%
summarise(number_of_stations = n())
Now we want to get a summary of the incident count, the number of station and the incident per station of each borough in order to have a general view of the New York City situation.
count_inc_brough <- fire_data_for_ffs %>% group_by(borough) %>% summarise(incident_count = n())
stations_borough$incident_per_station <- round(count_inc_brough$incident_count / stations_borough$number_of_stations, digits = 3)
count_inc_brough <- merge(count_inc_brough, stations_borough, by="borough")
count_inc_brough
## borough incident_count number_of_stations incident_per_station
## 1 Bronx 6406 34 188.412
## 2 Brooklyn 9280 64 145.000
## 3 Manhattan 7294 47 155.191
## 4 Queens 6188 48 128.917
## 5 Staten Island 1560 20 78.000
Now we convert the firefighter_station data frame into a Spartial Data Frame to contains the geometry points.
firefighter_stations_sdf <- st_as_sf(firefighter_stations, coords = c("Longitude", "Latitude"), crs = 4326)
head(firefighter_stations_sdf)
## Simple feature collection with 6 features and 10 fields
## Geometry type: POINT
## Dimension: XY
## Bounding box: xmin: -74.01252 ymin: 40.70347 xmax: -73.9929 ymax: 40.71976
## Geodetic CRS: WGS 84
## FacilityName FacilityAddress
## 1 Engine 4/Ladder 15 42 South Street
## 2 Engine 10/Ladder 10 124 Liberty Street
## 3 Engine 6 49 Beekman Street
## 4 Engine 7/Ladder 1/Battalion 1/Manhattan Borough Command 100-104 Duane Street
## 5 Ladder 8 14 North Moore Street
## 6 Engine 9/Ladder 6 75 Canal Street
## borough Postcode Community.Board Community.Council Census.Tract BIN
## 1 Manhattan 10005 1 1 7 1000867
## 2 Manhattan 10006 1 1 13 1075700
## 3 Manhattan 10038 1 1 1501 1001287
## 4 Manhattan 10007 1 1 33 1001647
## 5 Manhattan 10013 1 1 33 1002150
## 6 Manhattan 10002 3 1 16 1003898
## BBL
## 1 1000350001
## 2 1000520022
## 3 1000930030
## 4 1001500025
## 5 1001890035
## 6 1003000030
## NTA
## 1 Battery Park City-Lower Manhattan
## 2 Battery Park City-Lower Manhattan
## 3 Battery Park City-Lower Manhattan
## 4 SoHo-TriBeCa-Civic Center-Little Italy
## 5 SoHo-TriBeCa-Civic Center-Little Italy
## 6 Chinatown
## geometry
## 1 POINT (-74.00754 40.70347)
## 2 POINT (-74.01252 40.71007)
## 3 POINT (-74.00525 40.71005)
## 4 POINT (-74.00594 40.71546)
## 5 POINT (-74.00668 40.71976)
## 6 POINT (-73.9929 40.71521)
At this point we download the .geojson file that contain all the geometry of each borough in order to have a cool maps visualization of NYC.
geojson_newyork <- geojson_read("datasets/NYC_BoroughBoundaries.geojson", what = "sp")
geojson_newyork <- setNames(geojson_newyork, c("borough_code", "borough", "shape_area", "shape_leng"))
geojson_newyork$borough <- as.factor(geojson_newyork$borough)
geojson_newyork$borough_code <- NULL
head(geojson_newyork)
## borough shape_area shape_leng
## 1 Staten Island 1623620725.05 325917.35395
## 2 Manhattan 636520502.758 357713.308162
## 3 Bronx 1187174772.5 463180.579449
## 4 Brooklyn 1934138215.76 728146.574928
## 5 Queens 3041418506.64 888199.731385
And now we merge geojson_newyork with count_inc_brough maintaining the Spartial Data Frame type.
geojson_newyork@data = data.frame(geojson_newyork@data, count_inc_brough[match(geojson_newyork@data$borough, count_inc_brough$borough),])
geojson_newyork@data$borough.1 <- NULL
And finally we can plot the interactive map using the mapview function.
mapview(list(firefighter_stations_sdf, geojson_newyork),
zcol = list(NULL, "incident_count"),
legend = list(FALSE, TRUE),
homebutton = list(FALSE, TRUE), layer.name = list(NULL, "indicents_number"), alpha.regions = 0.5, aplha = 1)
As suggested by the professor we have opted to solve a regression problem with response first inc_resp_min_qy, then the emergency_min_qy and finally the ticket_time. Previously we were thinking to solve a multi-classification / binary classification problem for the inc_class_group, however we were considering all the time difference predictors that are a future information w.r.t. the inc_class_group in prediction time, so it doesn’t make much sense to use them, and it is possible also that they will result in super predictors. That’s way we decided to grab the professor suggestion.
For all three analysis we transform the relative response in log scale in order to simulate the behaviour of the Exponential and Gamma GLMs.
So first things first let’s check if there are some observations that have at least one of the time differences equal to zero.
summary(fire_data_new %>% select(disp_resp_min_qy, inc_travel_min_qy, inc_resp_min_qy, emergency_min_qy, ticket_time))
## disp_resp_min_qy inc_travel_min_qy inc_resp_min_qy emergency_min_qy
## Min. : 0.03333 Min. : 0.000 Min. : 0.350 Min. : 0.00
## 1st Qu.: 0.13333 1st Qu.: 3.883 1st Qu.: 4.383 1st Qu.: 7.15
## Median : 0.48333 Median : 4.983 Median : 5.467 Median : 11.93
## Mean : 0.50062 Mean : 5.448 Mean : 5.949 Mean : 17.08
## 3rd Qu.: 0.71667 3rd Qu.: 6.367 3rd Qu.: 6.850 3rd Qu.: 19.58
## Max. :34.73333 Max. :58.617 Max. :58.917 Max. :596.38
## ticket_time
## Min. : 1.083
## 1st Qu.: 12.850
## Median : 17.900
## Mean : 23.027
## 3rd Qu.: 25.954
## Max. :601.717
fire_data_new <- fire_data_new %>% filter(inc_travel_min_qy != 0, emergency_min_qy != 0)
But before creating any model have to split the cleaned dataset into train and test, with 0.8% of the whole dataset for the train set and the remaining 20% for the test set.
set.seed(43)
split <- sample.split(fire_data_new, SplitRatio = 0.8)
# Create training and testing sets
fire_data.train <- subset(fire_data_new, split == TRUE)
fire_data.test <- subset(fire_data_new, split == FALSE)
rownames(fire_data.train) <- NULL
rownames(fire_data.test) <- NULL
dim(fire_data.train)
## [1] 23495 17
dim(fire_data.test)
## [1] 7230 17
In this section we use inc_resp_min_qy as response, so we have to remove all the time difference predictors that are computed with one of the two datetime that comes after the incident datetime, so all the other except for our actual response.
# make a copy of the train and test
resp_min_fd.train <- fire_data.train
resp_min_fd.test <- fire_data.test
# remove the ticket_time
resp_min_fd.train <- resp_min_fd.train %>% select(-c(disp_resp_min_qy, inc_travel_min_qy, emergency_min_qy, ticket_time, total_assigned_unit))
resp_min_fd.test <- resp_min_fd.test %>% select(-c(disp_resp_min_qy, inc_travel_min_qy, emergency_min_qy, ticket_time, total_assigned_unit))
We decided to remove also the total_assigned_unit since it is in linear relation with the three assigned units predictors since it is the sum of the three.
Let’s build our first Linear Regression Model
lm_full <- lm(inc_resp_min_qy ~ ., data = resp_min_fd.train)
summary(lm_full)
##
## Call:
## lm(formula = inc_resp_min_qy ~ ., data = resp_min_fd.train)
##
## Residuals:
## Min 1Q Median 3Q Max
## -7.034 -1.382 -0.394 0.831 46.038
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 6.11260 0.78348 7.802 6.35e-15 ***
## inc_boroughBrooklyn -0.98784 0.04733 -20.870 < 2e-16 ***
## inc_boroughManhattan -0.13422 0.05001 -2.684 0.007282 **
## inc_boroughQueens -0.35510 0.05218 -6.806 1.03e-11 ***
## inc_boroughStaten Island -0.75228 0.08301 -9.063 < 2e-16 ***
## al_source_descEMS -0.07169 0.11209 -0.640 0.522425
## al_source_descEMS-911 -0.15750 0.11370 -1.385 0.165994
## al_source_descCLASS-3 0.09341 0.05548 1.684 0.092263 .
## al_source_descOthers -1.05292 0.11553 -9.114 < 2e-16 ***
## al_index_descInitial Alarm -0.24282 0.07216 -3.365 0.000766 ***
## al_index_descOthers 1.19195 0.75011 1.589 0.112065
## highest_al_levelFirst Alarm 0.56173 0.76799 0.731 0.464529
## highest_al_level2nd-3rd Alarm 3.64479 1.14235 3.191 0.001422 **
## inc_class_groupMedical MFAs -0.24731 0.24461 -1.011 0.312025
## inc_class_groupNonMedical Emergencies 0.50903 0.11084 4.593 4.40e-06 ***
## inc_class_groupNonMedical MFAs 0.45628 0.16017 2.849 0.004392 **
## inc_class_groupNonStructural Fires -0.11818 0.16497 -0.716 0.473769
## inc_class_groupStructural Fires -0.01450 0.14053 -0.103 0.917839
## engines_assigned -0.47023 0.02793 -16.838 < 2e-16 ***
## ladders_assigned 0.15343 0.04449 3.449 0.000564 ***
## others_units_assigned -0.11005 0.03039 -3.622 0.000293 ***
## day_typeWeekend -0.14779 0.03726 -3.966 7.33e-05 ***
## time_of_dayMorning -0.23390 0.05381 -4.347 1.39e-05 ***
## time_of_dayAfternoon -0.40711 0.05180 -7.859 4.05e-15 ***
## time_of_dayEvening -0.70855 0.05521 -12.833 < 2e-16 ***
## tua_is_oneY 0.98873 0.06251 15.816 < 2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 2.536 on 23469 degrees of freedom
## Multiple R-squared: 0.116, Adjusted R-squared: 0.1151
## F-statistic: 123.2 on 25 and 23469 DF, p-value: < 2.2e-16
par(mfrow=c(2,2))
plot(lm_full)
qqPlot(residuals(lm_full))
## [1] 21285 8563
Let’s have a look of the possible power transformation of the response.
powerTransform(lm_full)
## Estimated transformation parameter
## Y1
## 0.0841656
The function powerTransform suggests to take the log-transformation of the response, we take the log transformation because the estimated value of lambda is very close to zero.
lm_full_upd <- update(lm_full, log(inc_resp_min_qy) ~ .)
summary(lm_full_upd)
##
## Call:
## lm(formula = log(inc_resp_min_qy) ~ inc_borough + al_source_desc +
## al_index_desc + highest_al_level + inc_class_group + engines_assigned +
## ladders_assigned + others_units_assigned + day_type + time_of_day +
## tua_is_one, data = resp_min_fd.train)
##
## Residuals:
## Min 1Q Median 3Q Max
## -2.39479 -0.20248 -0.00533 0.19972 2.66305
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 1.743668 0.114607 15.214 < 2e-16 ***
## inc_boroughBrooklyn -0.173593 0.006924 -25.072 < 2e-16 ***
## inc_boroughManhattan -0.035685 0.007315 -4.878 1.08e-06 ***
## inc_boroughQueens -0.054415 0.007633 -7.129 1.04e-12 ***
## inc_boroughStaten Island -0.125916 0.012142 -10.370 < 2e-16 ***
## al_source_descEMS -0.001089 0.016396 -0.066 0.947026
## al_source_descEMS-911 -0.015647 0.016632 -0.941 0.346843
## al_source_descCLASS-3 0.041419 0.008116 5.103 3.36e-07 ***
## al_source_descOthers -0.445624 0.016900 -26.368 < 2e-16 ***
## al_index_descInitial Alarm -0.023221 0.010555 -2.200 0.027819 *
## al_index_descOthers 0.153561 0.109726 1.400 0.161676
## highest_al_levelFirst Alarm 0.097751 0.112342 0.870 0.384245
## highest_al_level2nd-3rd Alarm 0.580649 0.167103 3.475 0.000512 ***
## inc_class_groupMedical MFAs -0.033303 0.035782 -0.931 0.352016
## inc_class_groupNonMedical Emergencies 0.090276 0.016213 5.568 2.60e-08 ***
## inc_class_groupNonMedical MFAs 0.084954 0.023429 3.626 0.000288 ***
## inc_class_groupNonStructural Fires -0.009855 0.024132 -0.408 0.682996
## inc_class_groupStructural Fires -0.019453 0.020557 -0.946 0.344011
## engines_assigned -0.072140 0.004085 -17.659 < 2e-16 ***
## ladders_assigned 0.009218 0.006508 1.416 0.156678
## others_units_assigned -0.012008 0.004445 -2.702 0.006907 **
## day_typeWeekend -0.018039 0.005451 -3.309 0.000937 ***
## time_of_dayMorning -0.071102 0.007872 -9.033 < 2e-16 ***
## time_of_dayAfternoon -0.092715 0.007578 -12.235 < 2e-16 ***
## time_of_dayEvening -0.132050 0.008077 -16.350 < 2e-16 ***
## tua_is_oneY 0.144696 0.009144 15.823 < 2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.3709 on 23469 degrees of freedom
## Multiple R-squared: 0.1574, Adjusted R-squared: 0.1565
## F-statistic: 175.3 on 25 and 23469 DF, p-value: < 2.2e-16
The \(R^2\) has increase w.r.t. the previous model. Let’s see the residual plots.
par(mfrow=c(2,2))
plot(lm_full_upd)
residualPlots(lm_full_upd)
## Test stat Pr(>|Test stat|)
## inc_borough
## al_source_desc
## al_index_desc
## highest_al_level
## inc_class_group
## engines_assigned 4.9681 6.808e-07 ***
## ladders_assigned 4.6840 2.829e-06 ***
## others_units_assigned 3.7977 0.0001464 ***
## day_type
## time_of_day
## tua_is_one
## Tukey test 4.8322 1.350e-06 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
qqPlot(residuals(lm_full_upd))
## [1] 8563 5950
influenceIndexPlot(lm_full_upd, vars = "Cook")
summary(fitted(lm_full_upd))
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 0.5791 1.6111 1.7172 1.7018 1.7969 2.0803
lm_full_upd_2 <- update(lm_full, log(inc_resp_min_qy) ~ . + engines_assigned : inc_class_group + time_of_day : day_type + log(ladders_assigned + 1) + log(engines_assigned + 1) + log(others_units_assigned + 1) + log(engines_assigned + 1) : inc_class_group)
summary(lm_full_upd_2)
##
## Call:
## lm(formula = log(inc_resp_min_qy) ~ inc_borough + al_source_desc +
## al_index_desc + highest_al_level + inc_class_group + engines_assigned +
## ladders_assigned + others_units_assigned + day_type + time_of_day +
## tua_is_one + log(ladders_assigned + 1) + log(engines_assigned +
## 1) + log(others_units_assigned + 1) + inc_class_group:engines_assigned +
## day_type:time_of_day + inc_class_group:log(engines_assigned +
## 1), data = resp_min_fd.train)
##
## Residuals:
## Min 1Q Median 3Q Max
## -2.40880 -0.19919 -0.00243 0.20178 2.52229
##
## Coefficients:
## Estimate
## (Intercept) 1.6856537
## inc_boroughBrooklyn -0.1758520
## inc_boroughManhattan -0.0396859
## inc_boroughQueens -0.0553967
## inc_boroughStaten Island -0.1290309
## al_source_descEMS -0.0120352
## al_source_descEMS-911 -0.0264792
## al_source_descCLASS-3 0.0331886
## al_source_descOthers -0.4702979
## al_index_descInitial Alarm -0.0242975
## al_index_descOthers 0.0748675
## highest_al_levelFirst Alarm 0.1703187
## highest_al_level2nd-3rd Alarm -0.2416375
## inc_class_groupMedical MFAs -0.1357881
## inc_class_groupNonMedical Emergencies 0.4407230
## inc_class_groupNonMedical MFAs 0.5763845
## inc_class_groupNonStructural Fires 0.0811843
## inc_class_groupStructural Fires 0.3550350
## engines_assigned -0.2823196
## ladders_assigned 0.2882807
## others_units_assigned -0.0003893
## day_typeWeekend 0.0105528
## time_of_dayMorning -0.0588187
## time_of_dayAfternoon -0.0814243
## time_of_dayEvening -0.1394133
## tua_is_oneY -0.0365789
## log(ladders_assigned + 1) -0.6733789
## log(engines_assigned + 1) 0.5650244
## log(others_units_assigned + 1) -0.0490430
## inc_class_groupMedical MFAs:engines_assigned 1.7116229
## inc_class_groupNonMedical Emergencies:engines_assigned 0.4257164
## inc_class_groupNonMedical MFAs:engines_assigned 0.5579366
## inc_class_groupNonStructural Fires:engines_assigned 0.3594716
## inc_class_groupStructural Fires:engines_assigned 0.2278762
## day_typeWeekend:time_of_dayMorning -0.0559302
## day_typeWeekend:time_of_dayAfternoon -0.0558940
## day_typeWeekend:time_of_dayEvening 0.0267962
## inc_class_groupMedical MFAs:log(engines_assigned + 1) -2.3246591
## inc_class_groupNonMedical Emergencies:log(engines_assigned + 1) -1.1431006
## inc_class_groupNonMedical MFAs:log(engines_assigned + 1) -1.5970216
## inc_class_groupNonStructural Fires:log(engines_assigned + 1) -0.7601430
## inc_class_groupStructural Fires:log(engines_assigned + 1) -0.7741744
## Std. Error
## (Intercept) 0.1167524
## inc_boroughBrooklyn 0.0068516
## inc_boroughManhattan 0.0072369
## inc_boroughQueens 0.0075551
## inc_boroughStaten Island 0.0120139
## al_source_descEMS 0.0164834
## al_source_descEMS-911 0.0166904
## al_source_descCLASS-3 0.0082382
## al_source_descOthers 0.0168007
## al_index_descInitial Alarm 0.0104339
## al_index_descOthers 0.1147072
## highest_al_levelFirst Alarm 0.1121326
## highest_al_level2nd-3rd Alarm 0.2227851
## inc_class_groupMedical MFAs 0.1403131
## inc_class_groupNonMedical Emergencies 0.0284609
## inc_class_groupNonMedical MFAs 0.0407777
## inc_class_groupNonStructural Fires 0.0899348
## inc_class_groupStructural Fires 0.0594739
## engines_assigned 0.0821379
## ladders_assigned 0.0253938
## others_units_assigned 0.0095243
## day_typeWeekend 0.0134069
## time_of_dayMorning 0.0092904
## time_of_dayAfternoon 0.0089729
## time_of_dayEvening 0.0095828
## tua_is_oneY 0.0139051
## log(ladders_assigned + 1) 0.0490649
## log(engines_assigned + 1) 0.1305867
## log(others_units_assigned + 1) 0.0215114
## inc_class_groupMedical MFAs:engines_assigned 0.9186172
## inc_class_groupNonMedical Emergencies:engines_assigned 0.0834392
## inc_class_groupNonMedical MFAs:engines_assigned 0.0952255
## inc_class_groupNonStructural Fires:engines_assigned 0.1147301
## inc_class_groupStructural Fires:engines_assigned 0.0883516
## day_typeWeekend:time_of_dayMorning 0.0170393
## day_typeWeekend:time_of_dayAfternoon 0.0163233
## day_typeWeekend:time_of_dayEvening 0.0173291
## inc_class_groupMedical MFAs:log(engines_assigned + 1) 1.3906795
## inc_class_groupNonMedical Emergencies:log(engines_assigned + 1) 0.1361191
## inc_class_groupNonMedical MFAs:log(engines_assigned + 1) 0.1744356
## inc_class_groupNonStructural Fires:log(engines_assigned + 1) 0.2648748
## inc_class_groupStructural Fires:log(engines_assigned + 1) 0.1703332
## t value
## (Intercept) 14.438
## inc_boroughBrooklyn -25.666
## inc_boroughManhattan -5.484
## inc_boroughQueens -7.332
## inc_boroughStaten Island -10.740
## al_source_descEMS -0.730
## al_source_descEMS-911 -1.586
## al_source_descCLASS-3 4.029
## al_source_descOthers -27.993
## al_index_descInitial Alarm -2.329
## al_index_descOthers 0.653
## highest_al_levelFirst Alarm 1.519
## highest_al_level2nd-3rd Alarm -1.085
## inc_class_groupMedical MFAs -0.968
## inc_class_groupNonMedical Emergencies 15.485
## inc_class_groupNonMedical MFAs 14.135
## inc_class_groupNonStructural Fires 0.903
## inc_class_groupStructural Fires 5.970
## engines_assigned -3.437
## ladders_assigned 11.352
## others_units_assigned -0.041
## day_typeWeekend 0.787
## time_of_dayMorning -6.331
## time_of_dayAfternoon -9.074
## time_of_dayEvening -14.548
## tua_is_oneY -2.631
## log(ladders_assigned + 1) -13.724
## log(engines_assigned + 1) 4.327
## log(others_units_assigned + 1) -2.280
## inc_class_groupMedical MFAs:engines_assigned 1.863
## inc_class_groupNonMedical Emergencies:engines_assigned 5.102
## inc_class_groupNonMedical MFAs:engines_assigned 5.859
## inc_class_groupNonStructural Fires:engines_assigned 3.133
## inc_class_groupStructural Fires:engines_assigned 2.579
## day_typeWeekend:time_of_dayMorning -3.282
## day_typeWeekend:time_of_dayAfternoon -3.424
## day_typeWeekend:time_of_dayEvening 1.546
## inc_class_groupMedical MFAs:log(engines_assigned + 1) -1.672
## inc_class_groupNonMedical Emergencies:log(engines_assigned + 1) -8.398
## inc_class_groupNonMedical MFAs:log(engines_assigned + 1) -9.155
## inc_class_groupNonStructural Fires:log(engines_assigned + 1) -2.870
## inc_class_groupStructural Fires:log(engines_assigned + 1) -4.545
## Pr(>|t|)
## (Intercept) < 2e-16 ***
## inc_boroughBrooklyn < 2e-16 ***
## inc_boroughManhattan 4.21e-08 ***
## inc_boroughQueens 2.33e-13 ***
## inc_boroughStaten Island < 2e-16 ***
## al_source_descEMS 0.465312
## al_source_descEMS-911 0.112640
## al_source_descCLASS-3 5.63e-05 ***
## al_source_descOthers < 2e-16 ***
## al_index_descInitial Alarm 0.019883 *
## al_index_descOthers 0.513967
## highest_al_levelFirst Alarm 0.128800
## highest_al_level2nd-3rd Alarm 0.278101
## inc_class_groupMedical MFAs 0.333179
## inc_class_groupNonMedical Emergencies < 2e-16 ***
## inc_class_groupNonMedical MFAs < 2e-16 ***
## inc_class_groupNonStructural Fires 0.366694
## inc_class_groupStructural Fires 2.41e-09 ***
## engines_assigned 0.000589 ***
## ladders_assigned < 2e-16 ***
## others_units_assigned 0.967393
## day_typeWeekend 0.431221
## time_of_dayMorning 2.48e-10 ***
## time_of_dayAfternoon < 2e-16 ***
## time_of_dayEvening < 2e-16 ***
## tua_is_oneY 0.008529 **
## log(ladders_assigned + 1) < 2e-16 ***
## log(engines_assigned + 1) 1.52e-05 ***
## log(others_units_assigned + 1) 0.022625 *
## inc_class_groupMedical MFAs:engines_assigned 0.062438 .
## inc_class_groupNonMedical Emergencies:engines_assigned 3.38e-07 ***
## inc_class_groupNonMedical MFAs:engines_assigned 4.72e-09 ***
## inc_class_groupNonStructural Fires:engines_assigned 0.001731 **
## inc_class_groupStructural Fires:engines_assigned 0.009909 **
## day_typeWeekend:time_of_dayMorning 0.001031 **
## day_typeWeekend:time_of_dayAfternoon 0.000618 ***
## day_typeWeekend:time_of_dayEvening 0.122043
## inc_class_groupMedical MFAs:log(engines_assigned + 1) 0.094617 .
## inc_class_groupNonMedical Emergencies:log(engines_assigned + 1) < 2e-16 ***
## inc_class_groupNonMedical MFAs:log(engines_assigned + 1) < 2e-16 ***
## inc_class_groupNonStructural Fires:log(engines_assigned + 1) 0.004111 **
## inc_class_groupStructural Fires:log(engines_assigned + 1) 5.52e-06 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.3665 on 23453 degrees of freedom
## Multiple R-squared: 0.178, Adjusted R-squared: 0.1765
## F-statistic: 123.8 on 41 and 23453 DF, p-value: < 2.2e-16
par(mfrow=c(2,2))
plot(lm_full_upd_2)
## Warning: non si riesce a fare il plot senza sfruttarne uno:
## 3754
residualPlots(lm_full_upd_2)
## Test stat Pr(>|Test stat|)
## inc_borough
## al_source_desc
## al_index_desc
## highest_al_level
## inc_class_group
## engines_assigned -7.1318 1.020e-12 ***
## ladders_assigned -8.3001 < 2.2e-16 ***
## others_units_assigned -6.0325 1.639e-09 ***
## day_type
## time_of_day
## tua_is_one
## log(ladders_assigned + 1) 10.9903 < 2.2e-16 ***
## log(engines_assigned + 1) 10.2240 < 2.2e-16 ***
## log(others_units_assigned + 1) 6.2572 3.988e-10 ***
## Tukey test 3.1056 0.001899 **
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
qqPlot(residuals(lm_full_upd_2))
## [1] 8563 5950
influenceIndexPlot(lm_full_upd_2, vars = "Cook")
summary(fitted(lm_full_upd_2))
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 0.6146 1.6020 1.7133 1.7018 1.8022 2.3719
Let’s see the influential point and check how it behaves
resp_min_fd.train[10036,]
## inc_borough al_source_desc al_index_desc highest_al_level
## 10036 Brooklyn PHONE Others 2nd-3rd Alarm
## inc_class_group inc_resp_min_qy engines_assigned ladders_assigned
## 10036 Structural Fires 2.433333 19 15
## others_units_assigned day_type time_of_day tua_is_one
## 10036 32 Weekend Morning N
Now we investigate on why the 10036 observation is an influential point. Let’s see how is the behaviour of the logarithm scale of the assigned units for the Structural Fire and see if we can find the observation 10036
infl_point <- subset(resp_min_fd.train, inc_class_group == "Structural Fires")
par(mfrow = c(1, 3))
boxplot(log(infl_point$engines_assigned + 1), main = "Engines Assigned")
points(log(resp_min_fd.train[10036, "engines_assigned"] + 1), col = "red", pch = 16)
boxplot(log(infl_point$ladders_assigned + 1), main = "Ladders Assigned")
points(log(resp_min_fd.train[10036, "ladders_assigned"] + 1), col = "red", pch = 16)
boxplot(log(infl_point$others_units_assigned + 1), main = "Other Units Assigned")
points(log(resp_min_fd.train[10036, "others_units_assigned"] + 1), col = "red", pch = 16)
And we see that the observed incident is far away from the distribution of others assigned units for the Medical MFAs incident, so we decide to remove this observation during the refitting of the last model.
lm_full_upd_3 <- update(lm_full_upd_2, subset = -10036)
summary(lm_full_upd_3)
##
## Call:
## lm(formula = log(inc_resp_min_qy) ~ inc_borough + al_source_desc +
## al_index_desc + highest_al_level + inc_class_group + engines_assigned +
## ladders_assigned + others_units_assigned + day_type + time_of_day +
## tua_is_one + log(ladders_assigned + 1) + log(engines_assigned +
## 1) + log(others_units_assigned + 1) + inc_class_group:engines_assigned +
## day_type:time_of_day + inc_class_group:log(engines_assigned +
## 1), data = resp_min_fd.train, subset = -10036)
##
## Residuals:
## Min 1Q Median 3Q Max
## -2.40856 -0.19899 -0.00244 0.20206 2.51954
##
## Coefficients:
## Estimate
## (Intercept) 1.714774
## inc_boroughBrooklyn -0.175491
## inc_boroughManhattan -0.039737
## inc_boroughQueens -0.055430
## inc_boroughStaten Island -0.129773
## al_source_descEMS -0.013668
## al_source_descEMS-911 -0.028089
## al_source_descCLASS-3 0.034611
## al_source_descOthers -0.471130
## al_index_descInitial Alarm -0.024200
## al_index_descOthers -0.036565
## highest_al_levelFirst Alarm 0.151762
## highest_al_level2nd-3rd Alarm -0.371511
## inc_class_groupMedical MFAs -0.135355
## inc_class_groupNonMedical Emergencies 0.441381
## inc_class_groupNonMedical MFAs 0.576601
## inc_class_groupNonStructural Fires 0.078230
## inc_class_groupStructural Fires 0.444113
## engines_assigned -0.287836
## ladders_assigned 0.314006
## others_units_assigned 0.005544
## day_typeWeekend 0.011109
## time_of_dayMorning -0.058671
## time_of_dayAfternoon -0.081279
## time_of_dayEvening -0.138707
## tua_is_oneY -0.041891
## log(ladders_assigned + 1) -0.721062
## log(engines_assigned + 1) 0.567071
## log(others_units_assigned + 1) -0.063727
## inc_class_groupMedical MFAs:engines_assigned 1.722178
## inc_class_groupNonMedical Emergencies:engines_assigned 0.431075
## inc_class_groupNonMedical MFAs:engines_assigned 0.564726
## inc_class_groupNonStructural Fires:engines_assigned 0.365763
## inc_class_groupStructural Fires:engines_assigned 0.317236
## day_typeWeekend:time_of_dayMorning -0.055151
## day_typeWeekend:time_of_dayAfternoon -0.056175
## day_typeWeekend:time_of_dayEvening 0.025889
## inc_class_groupMedical MFAs:log(engines_assigned + 1) -2.340072
## inc_class_groupNonMedical Emergencies:log(engines_assigned + 1) -1.150681
## inc_class_groupNonMedical MFAs:log(engines_assigned + 1) -1.607398
## inc_class_groupNonStructural Fires:log(engines_assigned + 1) -0.765520
## inc_class_groupStructural Fires:log(engines_assigned + 1) -1.033113
## Std. Error
## (Intercept) 0.116849
## inc_boroughBrooklyn 0.006849
## inc_boroughManhattan 0.007233
## inc_boroughQueens 0.007551
## inc_boroughStaten Island 0.012009
## al_source_descEMS 0.016479
## al_source_descEMS-911 0.016686
## al_source_descCLASS-3 0.008239
## al_source_descOthers 0.016793
## al_index_descInitial Alarm 0.010429
## al_index_descOthers 0.116918
## highest_al_levelFirst Alarm 0.112143
## highest_al_level2nd-3rd Alarm 0.224272
## inc_class_groupMedical MFAs 0.140245
## inc_class_groupNonMedical Emergencies 0.028448
## inc_class_groupNonMedical MFAs 0.040758
## inc_class_groupNonStructural Fires 0.089893
## inc_class_groupStructural Fires 0.062201
## engines_assigned 0.082106
## ladders_assigned 0.025927
## others_units_assigned 0.009597
## day_typeWeekend 0.013401
## time_of_dayMorning 0.009286
## time_of_dayAfternoon 0.008969
## time_of_dayEvening 0.009579
## tua_is_oneY 0.013941
## log(ladders_assigned + 1) 0.050011
## log(engines_assigned + 1) 0.130524
## log(others_units_assigned + 1) 0.021712
## inc_class_groupMedical MFAs:engines_assigned 0.918176
## inc_class_groupNonMedical Emergencies:engines_assigned 0.083406
## inc_class_groupNonMedical MFAs:engines_assigned 0.095190
## inc_class_groupNonStructural Fires:engines_assigned 0.114682
## inc_class_groupStructural Fires:engines_assigned 0.090199
## day_typeWeekend:time_of_dayMorning 0.017032
## day_typeWeekend:time_of_dayAfternoon 0.016316
## day_typeWeekend:time_of_dayEvening 0.017322
## inc_class_groupMedical MFAs:log(engines_assigned + 1) 1.390012
## inc_class_groupNonMedical Emergencies:log(engines_assigned + 1) 0.136062
## inc_class_groupNonMedical MFAs:log(engines_assigned + 1) 0.174364
## inc_class_groupNonStructural Fires:log(engines_assigned + 1) 0.264749
## inc_class_groupStructural Fires:log(engines_assigned + 1) 0.178378
## t value
## (Intercept) 14.675
## inc_boroughBrooklyn -25.624
## inc_boroughManhattan -5.494
## inc_boroughQueens -7.340
## inc_boroughStaten Island -10.806
## al_source_descEMS -0.829
## al_source_descEMS-911 -1.683
## al_source_descCLASS-3 4.201
## al_source_descOthers -28.054
## al_index_descInitial Alarm -2.320
## al_index_descOthers -0.313
## highest_al_levelFirst Alarm 1.353
## highest_al_level2nd-3rd Alarm -1.657
## inc_class_groupMedical MFAs -0.965
## inc_class_groupNonMedical Emergencies 15.516
## inc_class_groupNonMedical MFAs 14.147
## inc_class_groupNonStructural Fires 0.870
## inc_class_groupStructural Fires 7.140
## engines_assigned -3.506
## ladders_assigned 12.111
## others_units_assigned 0.578
## day_typeWeekend 0.829
## time_of_dayMorning -6.318
## time_of_dayAfternoon -9.063
## time_of_dayEvening -14.480
## tua_is_oneY -3.005
## log(ladders_assigned + 1) -14.418
## log(engines_assigned + 1) 4.345
## log(others_units_assigned + 1) -2.935
## inc_class_groupMedical MFAs:engines_assigned 1.876
## inc_class_groupNonMedical Emergencies:engines_assigned 5.168
## inc_class_groupNonMedical MFAs:engines_assigned 5.933
## inc_class_groupNonStructural Fires:engines_assigned 3.189
## inc_class_groupStructural Fires:engines_assigned 3.517
## day_typeWeekend:time_of_dayMorning -3.238
## day_typeWeekend:time_of_dayAfternoon -3.443
## day_typeWeekend:time_of_dayEvening 1.495
## inc_class_groupMedical MFAs:log(engines_assigned + 1) -1.683
## inc_class_groupNonMedical Emergencies:log(engines_assigned + 1) -8.457
## inc_class_groupNonMedical MFAs:log(engines_assigned + 1) -9.219
## inc_class_groupNonStructural Fires:log(engines_assigned + 1) -2.891
## inc_class_groupStructural Fires:log(engines_assigned + 1) -5.792
## Pr(>|t|)
## (Intercept) < 2e-16 ***
## inc_boroughBrooklyn < 2e-16 ***
## inc_boroughManhattan 3.98e-08 ***
## inc_boroughQueens 2.20e-13 ***
## inc_boroughStaten Island < 2e-16 ***
## al_source_descEMS 0.406876
## al_source_descEMS-911 0.092308 .
## al_source_descCLASS-3 2.67e-05 ***
## al_source_descOthers < 2e-16 ***
## al_index_descInitial Alarm 0.020326 *
## al_index_descOthers 0.754480
## highest_al_levelFirst Alarm 0.175979
## highest_al_level2nd-3rd Alarm 0.097630 .
## inc_class_groupMedical MFAs 0.334490
## inc_class_groupNonMedical Emergencies < 2e-16 ***
## inc_class_groupNonMedical MFAs < 2e-16 ***
## inc_class_groupNonStructural Fires 0.384172
## inc_class_groupStructural Fires 9.61e-13 ***
## engines_assigned 0.000456 ***
## ladders_assigned < 2e-16 ***
## others_units_assigned 0.563537
## day_typeWeekend 0.407116
## time_of_dayMorning 2.69e-10 ***
## time_of_dayAfternoon < 2e-16 ***
## time_of_dayEvening < 2e-16 ***
## tua_is_oneY 0.002660 **
## log(ladders_assigned + 1) < 2e-16 ***
## log(engines_assigned + 1) 1.40e-05 ***
## log(others_units_assigned + 1) 0.003338 **
## inc_class_groupMedical MFAs:engines_assigned 0.060716 .
## inc_class_groupNonMedical Emergencies:engines_assigned 2.38e-07 ***
## inc_class_groupNonMedical MFAs:engines_assigned 3.02e-09 ***
## inc_class_groupNonStructural Fires:engines_assigned 0.001428 **
## inc_class_groupStructural Fires:engines_assigned 0.000437 ***
## day_typeWeekend:time_of_dayMorning 0.001205 **
## day_typeWeekend:time_of_dayAfternoon 0.000576 ***
## day_typeWeekend:time_of_dayEvening 0.135037
## inc_class_groupMedical MFAs:log(engines_assigned + 1) 0.092293 .
## inc_class_groupNonMedical Emergencies:log(engines_assigned + 1) < 2e-16 ***
## inc_class_groupNonMedical MFAs:log(engines_assigned + 1) < 2e-16 ***
## inc_class_groupNonStructural Fires:log(engines_assigned + 1) 0.003838 **
## inc_class_groupStructural Fires:log(engines_assigned + 1) 7.06e-09 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.3663 on 23452 degrees of freedom
## Multiple R-squared: 0.1786, Adjusted R-squared: 0.1772
## F-statistic: 124.4 on 41 and 23452 DF, p-value: < 2.2e-16
par(mfrow=c(2,2))
plot(lm_full_upd_3)
## Warning in sqrt(crit * p * (1 - hh)/hh): Si è prodotto un NaN
## Warning in sqrt(crit * p * (1 - hh)/hh): Si è prodotto un NaN
residualPlots(lm_full_upd_3)
## Test stat Pr(>|Test stat|)
## inc_borough
## al_source_desc
## al_index_desc
## highest_al_level
## inc_class_group
## engines_assigned -5.7614 8.444e-09 ***
## ladders_assigned -9.1856 < 2.2e-16 ***
## others_units_assigned -3.5677 0.0003609 ***
## day_type
## time_of_day
## tua_is_one
## log(ladders_assigned + 1) 10.2672 < 2.2e-16 ***
## log(engines_assigned + 1) 9.0238 < 2.2e-16 ***
## log(others_units_assigned + 1) 4.7254 2.310e-06 ***
## Tukey test 2.9859 0.0028271 **
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
qqPlot(residuals(lm_full_upd_3))
## [1] 8563 5950
influenceIndexPlot(lm_full_upd_3, vars = "Cook")
summary(fitted(lm_full_upd_3))
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 0.6677 1.6025 1.7132 1.7019 1.8021 2.3737
resp_min_fd.train[3754,]
## inc_borough al_source_desc al_index_desc highest_al_level inc_class_group
## 3754 Bronx EMS DEFAULT RECORD First Alarm Medical MFAs
## inc_resp_min_qy engines_assigned ladders_assigned others_units_assigned
## 3754 10.06667 2 2 1
## day_type time_of_day tua_is_one
## 3754 Weekend Afternoon N
infl_point <- subset(resp_min_fd.train, inc_class_group == "Medical MFAs" & inc_borough == "Bronx")
par(mfrow = c(1, 3))
boxplot(log(infl_point$engines_assigned + 1), main = "Engines Assigned")
points(log(resp_min_fd.train[3754, "engines_assigned"] + 1), col = "red", pch = 16)
boxplot(log(infl_point$ladders_assigned + 1), main = "Ladders Assigned")
points(log(resp_min_fd.train[3754, "ladders_assigned"] + 1), col = "red", pch = 16)
boxplot(log(infl_point$others_units_assigned + 1), main = "Other Units Assigned")
points(log(resp_min_fd.train[3754, "others_units_assigned"] + 1), col = "red", pch = 16)
lm_full_upd_4 <- update(lm_full_upd_2, subset = -c(10036, 3754))
summary(lm_full_upd_4)
##
## Call:
## lm(formula = log(inc_resp_min_qy) ~ inc_borough + al_source_desc +
## al_index_desc + highest_al_level + inc_class_group + engines_assigned +
## ladders_assigned + others_units_assigned + day_type + time_of_day +
## tua_is_one + log(ladders_assigned + 1) + log(engines_assigned +
## 1) + log(others_units_assigned + 1) + inc_class_group:engines_assigned +
## day_type:time_of_day + inc_class_group:log(engines_assigned +
## 1), data = resp_min_fd.train, subset = -c(10036, 3754))
##
## Residuals:
## Min 1Q Median 3Q Max
## -2.40856 -0.19901 -0.00244 0.20207 2.51954
##
## Coefficients: (1 not defined because of singularities)
## Estimate
## (Intercept) 1.714774
## inc_boroughBrooklyn -0.175491
## inc_boroughManhattan -0.039737
## inc_boroughQueens -0.055430
## inc_boroughStaten Island -0.129773
## al_source_descEMS -0.013668
## al_source_descEMS-911 -0.028089
## al_source_descCLASS-3 0.034611
## al_source_descOthers -0.471130
## al_index_descInitial Alarm -0.024200
## al_index_descOthers -0.036565
## highest_al_levelFirst Alarm 0.151762
## highest_al_level2nd-3rd Alarm -0.371511
## inc_class_groupMedical MFAs -0.135355
## inc_class_groupNonMedical Emergencies 0.441381
## inc_class_groupNonMedical MFAs 0.576601
## inc_class_groupNonStructural Fires 0.078230
## inc_class_groupStructural Fires 0.444113
## engines_assigned -0.287836
## ladders_assigned 0.314006
## others_units_assigned 0.005544
## day_typeWeekend 0.011109
## time_of_dayMorning -0.058671
## time_of_dayAfternoon -0.081279
## time_of_dayEvening -0.138707
## tua_is_oneY -0.041891
## log(ladders_assigned + 1) -0.721062
## log(engines_assigned + 1) 0.567071
## log(others_units_assigned + 1) -0.063727
## inc_class_groupMedical MFAs:engines_assigned 0.100164
## inc_class_groupNonMedical Emergencies:engines_assigned 0.431075
## inc_class_groupNonMedical MFAs:engines_assigned 0.564726
## inc_class_groupNonStructural Fires:engines_assigned 0.365763
## inc_class_groupStructural Fires:engines_assigned 0.317236
## day_typeWeekend:time_of_dayMorning -0.055151
## day_typeWeekend:time_of_dayAfternoon -0.056175
## day_typeWeekend:time_of_dayEvening 0.025889
## inc_class_groupMedical MFAs:log(engines_assigned + 1) NA
## inc_class_groupNonMedical Emergencies:log(engines_assigned + 1) -1.150681
## inc_class_groupNonMedical MFAs:log(engines_assigned + 1) -1.607398
## inc_class_groupNonStructural Fires:log(engines_assigned + 1) -0.765520
## inc_class_groupStructural Fires:log(engines_assigned + 1) -1.033113
## Std. Error
## (Intercept) 0.116849
## inc_boroughBrooklyn 0.006849
## inc_boroughManhattan 0.007233
## inc_boroughQueens 0.007551
## inc_boroughStaten Island 0.012009
## al_source_descEMS 0.016479
## al_source_descEMS-911 0.016686
## al_source_descCLASS-3 0.008239
## al_source_descOthers 0.016793
## al_index_descInitial Alarm 0.010429
## al_index_descOthers 0.116918
## highest_al_levelFirst Alarm 0.112143
## highest_al_level2nd-3rd Alarm 0.224272
## inc_class_groupMedical MFAs 0.140245
## inc_class_groupNonMedical Emergencies 0.028448
## inc_class_groupNonMedical MFAs 0.040758
## inc_class_groupNonStructural Fires 0.089893
## inc_class_groupStructural Fires 0.062201
## engines_assigned 0.082106
## ladders_assigned 0.025927
## others_units_assigned 0.009597
## day_typeWeekend 0.013401
## time_of_dayMorning 0.009286
## time_of_dayAfternoon 0.008969
## time_of_dayEvening 0.009579
## tua_is_oneY 0.013941
## log(ladders_assigned + 1) 0.050011
## log(engines_assigned + 1) 0.130524
## log(others_units_assigned + 1) 0.021712
## inc_class_groupMedical MFAs:engines_assigned 0.144416
## inc_class_groupNonMedical Emergencies:engines_assigned 0.083406
## inc_class_groupNonMedical MFAs:engines_assigned 0.095190
## inc_class_groupNonStructural Fires:engines_assigned 0.114682
## inc_class_groupStructural Fires:engines_assigned 0.090199
## day_typeWeekend:time_of_dayMorning 0.017032
## day_typeWeekend:time_of_dayAfternoon 0.016316
## day_typeWeekend:time_of_dayEvening 0.017322
## inc_class_groupMedical MFAs:log(engines_assigned + 1) NA
## inc_class_groupNonMedical Emergencies:log(engines_assigned + 1) 0.136062
## inc_class_groupNonMedical MFAs:log(engines_assigned + 1) 0.174364
## inc_class_groupNonStructural Fires:log(engines_assigned + 1) 0.264749
## inc_class_groupStructural Fires:log(engines_assigned + 1) 0.178378
## t value
## (Intercept) 14.675
## inc_boroughBrooklyn -25.624
## inc_boroughManhattan -5.494
## inc_boroughQueens -7.340
## inc_boroughStaten Island -10.806
## al_source_descEMS -0.829
## al_source_descEMS-911 -1.683
## al_source_descCLASS-3 4.201
## al_source_descOthers -28.054
## al_index_descInitial Alarm -2.320
## al_index_descOthers -0.313
## highest_al_levelFirst Alarm 1.353
## highest_al_level2nd-3rd Alarm -1.657
## inc_class_groupMedical MFAs -0.965
## inc_class_groupNonMedical Emergencies 15.516
## inc_class_groupNonMedical MFAs 14.147
## inc_class_groupNonStructural Fires 0.870
## inc_class_groupStructural Fires 7.140
## engines_assigned -3.506
## ladders_assigned 12.111
## others_units_assigned 0.578
## day_typeWeekend 0.829
## time_of_dayMorning -6.318
## time_of_dayAfternoon -9.063
## time_of_dayEvening -14.480
## tua_is_oneY -3.005
## log(ladders_assigned + 1) -14.418
## log(engines_assigned + 1) 4.345
## log(others_units_assigned + 1) -2.935
## inc_class_groupMedical MFAs:engines_assigned 0.694
## inc_class_groupNonMedical Emergencies:engines_assigned 5.168
## inc_class_groupNonMedical MFAs:engines_assigned 5.933
## inc_class_groupNonStructural Fires:engines_assigned 3.189
## inc_class_groupStructural Fires:engines_assigned 3.517
## day_typeWeekend:time_of_dayMorning -3.238
## day_typeWeekend:time_of_dayAfternoon -3.443
## day_typeWeekend:time_of_dayEvening 1.495
## inc_class_groupMedical MFAs:log(engines_assigned + 1) NA
## inc_class_groupNonMedical Emergencies:log(engines_assigned + 1) -8.457
## inc_class_groupNonMedical MFAs:log(engines_assigned + 1) -9.219
## inc_class_groupNonStructural Fires:log(engines_assigned + 1) -2.891
## inc_class_groupStructural Fires:log(engines_assigned + 1) -5.792
## Pr(>|t|)
## (Intercept) < 2e-16 ***
## inc_boroughBrooklyn < 2e-16 ***
## inc_boroughManhattan 3.98e-08 ***
## inc_boroughQueens 2.20e-13 ***
## inc_boroughStaten Island < 2e-16 ***
## al_source_descEMS 0.406876
## al_source_descEMS-911 0.092308 .
## al_source_descCLASS-3 2.67e-05 ***
## al_source_descOthers < 2e-16 ***
## al_index_descInitial Alarm 0.020326 *
## al_index_descOthers 0.754480
## highest_al_levelFirst Alarm 0.175979
## highest_al_level2nd-3rd Alarm 0.097630 .
## inc_class_groupMedical MFAs 0.334490
## inc_class_groupNonMedical Emergencies < 2e-16 ***
## inc_class_groupNonMedical MFAs < 2e-16 ***
## inc_class_groupNonStructural Fires 0.384172
## inc_class_groupStructural Fires 9.61e-13 ***
## engines_assigned 0.000456 ***
## ladders_assigned < 2e-16 ***
## others_units_assigned 0.563537
## day_typeWeekend 0.407116
## time_of_dayMorning 2.69e-10 ***
## time_of_dayAfternoon < 2e-16 ***
## time_of_dayEvening < 2e-16 ***
## tua_is_oneY 0.002660 **
## log(ladders_assigned + 1) < 2e-16 ***
## log(engines_assigned + 1) 1.40e-05 ***
## log(others_units_assigned + 1) 0.003338 **
## inc_class_groupMedical MFAs:engines_assigned 0.487954
## inc_class_groupNonMedical Emergencies:engines_assigned 2.38e-07 ***
## inc_class_groupNonMedical MFAs:engines_assigned 3.02e-09 ***
## inc_class_groupNonStructural Fires:engines_assigned 0.001428 **
## inc_class_groupStructural Fires:engines_assigned 0.000437 ***
## day_typeWeekend:time_of_dayMorning 0.001205 **
## day_typeWeekend:time_of_dayAfternoon 0.000576 ***
## day_typeWeekend:time_of_dayEvening 0.135037
## inc_class_groupMedical MFAs:log(engines_assigned + 1) NA
## inc_class_groupNonMedical Emergencies:log(engines_assigned + 1) < 2e-16 ***
## inc_class_groupNonMedical MFAs:log(engines_assigned + 1) < 2e-16 ***
## inc_class_groupNonStructural Fires:log(engines_assigned + 1) 0.003838 **
## inc_class_groupStructural Fires:log(engines_assigned + 1) 7.06e-09 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.3663 on 23452 degrees of freedom
## Multiple R-squared: 0.1786, Adjusted R-squared: 0.1772
## F-statistic: 127.4 on 40 and 23452 DF, p-value: < 2.2e-16
par(mfrow=c(2,2))
plot(lm_full_upd_4)
residualPlots(lm_full_upd_4)
## Test stat Pr(>|Test stat|)
## inc_borough
## al_source_desc
## al_index_desc
## highest_al_level
## inc_class_group
## engines_assigned -5.7614 8.444e-09 ***
## ladders_assigned -9.1856 < 2.2e-16 ***
## others_units_assigned -3.5677 0.0003609 ***
## day_type
## time_of_day
## tua_is_one
## log(ladders_assigned + 1) 10.2672 < 2.2e-16 ***
## log(engines_assigned + 1) 9.0238 < 2.2e-16 ***
## log(others_units_assigned + 1) 4.7254 2.310e-06 ***
## Tukey test 2.9859 0.0028271 **
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
qqPlot(residuals(lm_full_upd_4))
## 8563 5950
## 8562 5949
influenceIndexPlot(lm_full_upd_4, vars = "Cook")
summary(fitted(lm_full_upd_4))
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 0.6677 1.6025 1.7132 1.7019 1.8021 2.3737
Tring also GAM with splines
library(splines)
library(gam)
## Caricamento del pacchetto richiesto: foreach
##
## Caricamento pacchetto: 'foreach'
## I seguenti oggetti sono mascherati da 'package:purrr':
##
## accumulate, when
## Loaded gam 1.22-3
##
## Caricamento pacchetto: 'gam'
## I seguenti oggetti sono mascherati da 'package:mgcv':
##
## gam, gam.control, gam.fit, s
fit.gam_1 <- gam(log(inc_resp_min_qy) ~ inc_borough + al_source_desc +
al_index_desc + highest_al_level + inc_class_group + s(engines_assigned, 5) +
s(ladders_assigned, 5) + s(others_units_assigned, 5) + day_type + time_of_day +
tua_is_one + inc_class_group:engines_assigned +
day_type:time_of_day, data = resp_min_fd.train)
summary(fit.gam_1)
##
## Call: gam(formula = log(inc_resp_min_qy) ~ inc_borough + al_source_desc +
## al_index_desc + highest_al_level + inc_class_group + s(engines_assigned,
## 5) + s(ladders_assigned, 5) + s(others_units_assigned, 5) +
## day_type + time_of_day + tua_is_one + inc_class_group:engines_assigned +
## day_type:time_of_day, data = resp_min_fd.train)
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -2.404920 -0.198418 -0.002595 0.201188 2.563805
##
## (Dispersion Parameter for gaussian family taken to be 0.1337)
##
## Null Deviance: 3831.721 on 23494 degrees of freedom
## Residual Deviance: 3134.168 on 23449 degrees of freedom
## AIC: 19441.03
##
## Number of Local Scoring Iterations: NA
##
## Anova for Parametric Effects
## Df Sum Sq Mean Sq F value Pr(>F)
## inc_borough 4 123.65 30.913 231.2808 < 2.2e-16 ***
## al_source_desc 4 108.92 27.231 203.7357 < 2.2e-16 ***
## al_index_desc 2 101.14 50.570 378.3522 < 2.2e-16 ***
## highest_al_level 2 17.89 8.944 66.9148 < 2.2e-16 ***
## inc_class_group 5 73.92 14.784 110.6093 < 2.2e-16 ***
## s(engines_assigned, 5) 1 300.66 300.660 2249.4542 < 2.2e-16 ***
## s(ladders_assigned, 5) 1 23.71 23.707 177.3663 < 2.2e-16 ***
## s(others_units_assigned, 5) 1 10.74 10.741 80.3632 < 2.2e-16 ***
## day_type 1 0.89 0.893 6.6811 0.0097502 **
## time_of_day 3 38.93 12.977 97.0868 < 2.2e-16 ***
## tua_is_one 1 1.88 1.878 14.0479 0.0001786 ***
## inc_class_group:engines_assigned 5 33.61 6.723 50.2984 < 2.2e-16 ***
## day_type:time_of_day 3 5.84 1.946 14.5616 1.795e-09 ***
## Residuals 23449 3134.17 0.134
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Anova for Nonparametric Effects
## Npar Df Npar F Pr(F)
## (Intercept)
## inc_borough
## al_source_desc
## al_index_desc
## highest_al_level
## inc_class_group
## s(engines_assigned, 5) 4 72.086 < 2.2e-16 ***
## s(ladders_assigned, 5) 4 71.902 < 2.2e-16 ***
## s(others_units_assigned, 5) 4 25.917 < 2.2e-16 ***
## day_type
## time_of_day
## tua_is_one
## inc_class_group:engines_assigned
## day_type:time_of_day
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
plot(fit.gam_1, se=TRUE, col="blue", lwd=2)
## Warning in preplot.Gam(x, terms = terms): No terms saved for "a:b" style
## interaction terms
fit.gam_2 <- gam(log(inc_resp_min_qy) ~ inc_borough + al_source_desc +
al_index_desc + highest_al_level + inc_class_group + s(engines_assigned, 5) +
s(ladders_assigned, 5) + s(others_units_assigned, 5) + day_type + time_of_day +
tua_is_one + log(ladders_assigned + 1) + log(engines_assigned +
1) + log(others_units_assigned + 1) + inc_class_group:engines_assigned +
day_type:time_of_day + inc_class_group:log(engines_assigned +
1), data = resp_min_fd.train)
summary(fit.gam_2)
##
## Call: gam(formula = log(inc_resp_min_qy) ~ inc_borough + al_source_desc +
## al_index_desc + highest_al_level + inc_class_group + s(engines_assigned,
## 5) + s(ladders_assigned, 5) + s(others_units_assigned, 5) +
## day_type + time_of_day + tua_is_one + log(ladders_assigned +
## 1) + log(engines_assigned + 1) + log(others_units_assigned +
## 1) + inc_class_group:engines_assigned + day_type:time_of_day +
## inc_class_group:log(engines_assigned + 1), data = resp_min_fd.train)
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -2.409056 -0.197574 -0.002072 0.200759 2.534720
##
## (Dispersion Parameter for gaussian family taken to be 0.1329)
##
## Null Deviance: 3831.721 on 23494 degrees of freedom
## Residual Deviance: 3115.187 on 23441 degrees of freedom
## AIC: 19314.31
##
## Number of Local Scoring Iterations: NA
##
## Anova for Parametric Effects
## Df Sum Sq Mean Sq F value
## inc_borough 4 127.27 31.82 239.411
## al_source_desc 4 142.87 35.72 268.774
## al_index_desc 2 15.31 7.65 57.592
## highest_al_level 2 52.11 26.05 196.039
## inc_class_group 5 63.86 12.77 96.106
## s(engines_assigned, 5) 1 318.28 318.28 2394.988
## s(ladders_assigned, 5) 1 1.97 1.97 14.829
## s(others_units_assigned, 5) 1 4.15 4.15 31.211
## day_type 1 1.34 1.34 10.048
## time_of_day 3 37.75 12.58 94.682
## tua_is_one 1 90.77 90.77 683.020
## log(ladders_assigned + 1) 1 41.43 41.43 311.786
## log(engines_assigned + 1) 1 60.61 60.61 456.067
## log(others_units_assigned + 1) 1 74.58 74.58 561.164
## inc_class_group:engines_assigned 5 29.95 5.99 45.069
## day_type:time_of_day 3 5.73 1.91 14.364
## inc_class_group:log(engines_assigned + 1) 5 14.35 2.87 21.598
## Residuals 23441 3115.19 0.13
## Pr(>F)
## inc_borough < 2.2e-16 ***
## al_source_desc < 2.2e-16 ***
## al_index_desc < 2.2e-16 ***
## highest_al_level < 2.2e-16 ***
## inc_class_group < 2.2e-16 ***
## s(engines_assigned, 5) < 2.2e-16 ***
## s(ladders_assigned, 5) 0.000118 ***
## s(others_units_assigned, 5) 2.340e-08 ***
## day_type 0.001527 **
## time_of_day < 2.2e-16 ***
## tua_is_one < 2.2e-16 ***
## log(ladders_assigned + 1) < 2.2e-16 ***
## log(engines_assigned + 1) < 2.2e-16 ***
## log(others_units_assigned + 1) < 2.2e-16 ***
## inc_class_group:engines_assigned < 2.2e-16 ***
## day_type:time_of_day 2.398e-09 ***
## inc_class_group:log(engines_assigned + 1) < 2.2e-16 ***
## Residuals
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Anova for Nonparametric Effects
## Npar Df Npar F Pr(F)
## (Intercept)
## inc_borough
## al_source_desc
## al_index_desc
## highest_al_level
## inc_class_group
## s(engines_assigned, 5) 4 24.863 < 2.2e-16 ***
## s(ladders_assigned, 5) 4 23.521 < 2.2e-16 ***
## s(others_units_assigned, 5) 4 10.959 7.066e-09 ***
## day_type
## time_of_day
## tua_is_one
## log(ladders_assigned + 1)
## log(engines_assigned + 1)
## log(others_units_assigned + 1)
## inc_class_group:engines_assigned
## day_type:time_of_day
## inc_class_group:log(engines_assigned + 1)
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
plot(fit.gam_2, se=TRUE, col="blue", lwd=2)
## Warning in preplot.Gam(x, terms = terms): No terms saved for "a:b" style
## interaction terms
obtain.test.ori.rmse <- function(model, x_test, y_test, s = NULL){
if(is.null(s)) {
model.pred <- predict(model, newx = x_test)
} else {
model.pred <- predict(model, s = s, newx = x_test)
}
model.rmse <- sqrt(mean((exp(model.pred) - y_test) ^ 2))
return(model.rmse)
}
x_train <- model.matrix(formula(lm_full_upd_4), data = resp_min_fd.train) #(log(inc_resp_min_qy) ~ . -1
y_train <- resp_min_fd.train$inc_resp_min_qy
x_test <- model.matrix(formula(lm_full_upd_4), data = resp_min_fd.test) #(log(inc_resp_min_qy) ~ . -1
y_test <- resp_min_fd.test$inc_resp_min_qy
gam.pred <- predict(fit.gam_2, newdata = resp_min_fd.test)
## Warning in predict.lm(object, newdata, se.fit, scale = 1, type = if (type == :
## prediction from rank-deficient fit; attr(*, "non-estim") has doubtful cases
gam.rmse <- sqrt(mean((exp(gam.pred) - y_test) ^ 2))
gam.rmse
## [1] 2.657133
regfit.full <- regsubsets(formula(lm_full_upd_4), data = resp_min_fd.train, nvmax = 41)
summ_regfit.full <- summary(regfit.full)
plot(summ_regfit.full$cp, xlab = "Number of variables", ylab = "Cp")
title("Best Subset - log(inc_resp_min_qy)")
min_cp <- which.min(summ_regfit.full$cp)
points(min_cp, summ_regfit.full$cp[min_cp], pch=20, col="red")
fw_regfit.full <- regsubsets(formula(lm_full_upd_4), data = resp_min_fd.train, nvmax = 41, method = "forward")
summ_fw_regfit.full <- summary(fw_regfit.full)
fw_min_cp <- which.min(summ_fw_regfit.full$cp)
bk_regfit.full <- regsubsets(formula(lm_full_upd_4), data = resp_min_fd.train, nvmax = 41, method = "backward")
summ_bk_regfit.full <- summary(bk_regfit.full)
bk_min_cp <- which.min(summ_bk_regfit.full$cp)
sw_regfit.full <- regsubsets(formula(lm_full_upd_4), data = resp_min_fd.train, nvmax = 41, method = "seqrep")
summ_sw_regfit.full <- summary(sw_regfit.full)
sw_min_cp <- which.min(summ_sw_regfit.full$cp)
par(mfrow = c(1, 3))
plot(summ_fw_regfit.full$cp, xlab = "Number of Variables", ylab = "Cp", type = 'l')
points(fw_min_cp, summ_fw_regfit.full$cp[fw_min_cp], pch=20, col="red")
plot(summ_bk_regfit.full$cp, xlab = "Number of Variables", ylab = "Cp", type = 'l')
points(bk_min_cp, summ_bk_regfit.full$cp[bk_min_cp], pch=20, col="red")
plot(summ_sw_regfit.full$cp, xlab = "Number of Variables", ylab = "Cp", type = 'l')
points(sw_min_cp, summ_sw_regfit.full$cp[sw_min_cp], pch=20, col="red")
# non è in scala originale
bs_method_error <- function(model, x_test, y_test) {
val.errors = rep(NA, 41)
for (i in 1:41) {
coefi = coef(model, id=i)
pred = x_test[,names(coefi)]%*%coefi
val.errors[i] = mean((pred - log(y_test))^2)
}
return(sqrt(val.errors))
}
val.errors_bs = bs_method_error(regfit.full, x_test, y_test)
min_rmse = which.min(val.errors_bs)
val.errors_fw = bs_method_error(fw_regfit.full, x_test, y_test)
fw_min_rmse = which.min(val.errors_bs)
val.errors_bk = bs_method_error(bk_regfit.full, x_test, y_test)
bk_min_rmse = which.min(val.errors_bs)
val.errors_sw = bs_method_error(sw_regfit.full, x_test, y_test)
sw_min_rmse = which.min(val.errors_bs)
par(mfrow=c(2,2))
plot(val.errors_bs, ylab="Root MSE", pch=19, type="b")
points(sqrt(regfit.full$rss[-1]/dim(resp_min_fd.train)[1]), col="blue", pch=19,type="b")
legend("topright", legend=c("Training", "Testing"), col=c("blue", "black"), pch=19)
points(min_rmse, val.errors_bs[min_rmse], pch=20, col="red")
title("regfit.full")
plot(val.errors_fw, ylab="Root MSE", pch=19, type="b")
points(sqrt(fw_regfit.full$rss[-1]/dim(resp_min_fd.train)[1]), col="blue", pch=19,type="b")
legend("topright", legend=c("Training", "Testing"), col=c("blue", "black"), pch=19)
points(fw_min_rmse, val.errors_fw[fw_min_rmse], pch=20, col="red")
title("fw_regfit.full")
plot(val.errors_bk, ylab="Root MSE", pch=19, type="b")
points(sqrt(bk_regfit.full$rss[-1]/dim(resp_min_fd.train)[1]), col="blue", pch=19,type="b")
legend("topright", legend=c("Training", "Testing"), col=c("blue", "black"), pch=19)
points(bk_min_rmse, val.errors_bk[bk_min_rmse], pch=20, col="red")
title("bk_regfit.full")
plot(val.errors_sw, ylab="Root MSE", pch=19, type="b")
points(sqrt(sw_regfit.full$rss[-1]/dim(resp_min_fd.train)[1]), col="blue", pch=19,type="b")
legend("topright", legend=c("Training", "Testing"), col=c("blue", "black"), pch=19)
points(sw_min_rmse, val.errors_sw[sw_min_rmse], pch=20, col="red")
title("sw_regfit.full")
val.errors_bs[min_rmse]
## [1] 0.3614087
val.errors_fw[fw_min_rmse]
## [1] 0.3615628
val.errors_bk[bk_min_rmse]
## [1] 0.3615555
val.errors_sw[sw_min_rmse]
## [1] 0.3614087
predict.regsubsets <- function(object, newdata, id, ...){
form <- as.formula(object$call[[2]])
mat <- model.matrix(form, newdata)
coefi <- coef(object, id=id)
mat[, names(coefi)]%*%coefi
}
'set.seed(11)
folds <- sample(rep(1:10, length=nrow(resp_min_fd.train)))
table(folds)
cv.errors <- matrix(NA, 10, 42)
for (k in 1:10) {
best.fit <- regsubsets(formula(lm_full_upd_4), data = resp_min_fd.train[folds != k,], nvmax = 41, method = "seqrep")
for (i in 1:41) {
pred <- predict.regsubsets(best.fit, resp_min_fd.train[folds == k, ], id = i)
cv.errors[k,i] = mean((log(resp_min_fd.train$inc_resp_min_qy[folds == k]) - pred)^2)
}
}
rmse.cv <- sqrt(apply(cv.errors, 2, mean))
true_min <- which.min(rmse.cv)
plot(rmse.cv, pch=19, type="b")
points(true_min , rmse.cv[true_min], pch=20, col="red")'
## [1] "set.seed(11)\nfolds <- sample(rep(1:10, length=nrow(resp_min_fd.train)))\ntable(folds)\ncv.errors <- matrix(NA, 10, 42)\nfor (k in 1:10) {\n best.fit <- regsubsets(formula(lm_full_upd_4), data = resp_min_fd.train[folds != k,], nvmax = 41, method = \"seqrep\")\n for (i in 1:41) {\n pred <- predict.regsubsets(best.fit, resp_min_fd.train[folds == k, ], id = i)\n cv.errors[k,i] = mean((log(resp_min_fd.train$inc_resp_min_qy[folds == k]) - pred)^2)\n }\n}\nrmse.cv <- sqrt(apply(cv.errors, 2, mean))\ntrue_min <- which.min(rmse.cv)\nplot(rmse.cv, pch=19, type=\"b\")\npoints(true_min , rmse.cv[true_min], pch=20, col=\"red\")"
#rmse.cv[true_min]
fit.ridge <- glmnet(x_train, y_train, alpha = 0)
plot(fit.ridge, xvar = "lambda", label = TRUE)
cv.ridge <- cv.glmnet(x_train, y_train, alpha = 0)
plot(cv.ridge)
bestlam.ridge <- cv.ridge$lambda.min
bestlam.ridge
## [1] 0.07339866
plot(fit.ridge, xvar = "lambda") ## notice xvar = "lambda"
abline(v = log(bestlam.ridge), lwd = 1.2, lty = "dashed")
ridge.rmse <- obtain.test.ori.rmse(fit.ridge, x_test, y_test, s = bestlam.ridge)
ridge.rmse
## [1] 833.6745
least.rmse <- obtain.test.ori.rmse(fit.ridge, x_test, y_test)
least.rmse
## [1] 547.3362
fit.lasso <- glmnet(x_train, y_train, alpha = 1)
plot(fit.lasso)
cv.ridge <- cv.glmnet(x_train, y_train, alpha = 1)
plot(cv.ridge) ## lasso path plot
bestlam.lasso <- cv.ridge$lambda.min
plot(fit.lasso, xvar = "lambda") ## notice xvar = "lambda"
abline(v = log(bestlam.lasso), lwd = 1.2, lty = "dashed")
lasso.rmse <- obtain.test.ori.rmse(fit.lasso, x_test, y_test, s = bestlam.lasso)
lasso.rmse
## [1] 1028.172